Diffstat (limited to 'third_party/aom/aom_dsp')
76 files changed, 8530 insertions, 4578 deletions
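Most of the build-system churn in the diff below follows one pattern: the legacy loop-filter SIMD sources are built only when CONFIG_PARALLEL_DEBLOCKING is disabled, and the old NEON convolve sources only when CONFIG_EXT_PARTITION is disabled, by conditionally appending them to the existing per-ISA source lists. As a rough illustrative sketch of that CMake idiom (the variable and file names mirror the diff; the snippet itself is not part of the change):

    if (NOT CONFIG_PARALLEL_DEBLOCKING)
      # Keep building the pre-existing AVX2 loop filter only when the new
      # parallel-deblocking path is not selected at configure time.
      set(AOM_DSP_COMMON_INTRIN_AVX2
          ${AOM_DSP_COMMON_INTRIN_AVX2}
          "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
    endif ()

The make-based build in aom_dsp.mk applies the same guard by wrapping the corresponding sources in ifneq ($(CONFIG_PARALLEL_DEBLOCKING),yes) ... endif.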
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
index 3ce6761ca..11b55caa7 100644
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -23,6 +23,7 @@ set(AOM_DSP_COMMON_SOURCES
     "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
     "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
     "${AOM_ROOT}/aom_dsp/intrapred.c"
+    "${AOM_ROOT}/aom_dsp/intrapred_common.h"
     "${AOM_ROOT}/aom_dsp/loopfilter.c"
     "${AOM_ROOT}/aom_dsp/prob.c"
     "${AOM_ROOT}/aom_dsp/prob.h"
@@ -45,7 +46,9 @@ set(AOM_DSP_COMMON_ASM_SSE2
 set(AOM_DSP_COMMON_INTRIN_SSE2
     "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
     "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+    "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
     "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
+    "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
     "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c")
 
 set(AOM_DSP_COMMON_ASM_SSSE3
@@ -55,6 +58,7 @@ set(AOM_DSP_COMMON_ASM_SSSE3
 
 set(AOM_DSP_COMMON_INTRIN_SSSE3
     "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+    "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c"
     "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c")
 
 set(AOM_DSP_COMMON_INTRIN_SSE4_1
@@ -64,16 +68,28 @@ set(AOM_DSP_COMMON_INTRIN_SSE4_1
 
 set(AOM_DSP_COMMON_INTRIN_AVX2
     "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
-    "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
+    "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
     "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
+    "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
     "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
     "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_INTRIN_AVX2
+      ${AOM_DSP_COMMON_INTRIN_AVX2}
+      "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c")
+endif ()
+
+if (NOT CONFIG_EXT_PARTITION)
+  set(AOM_DSP_COMMON_ASM_NEON
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm"
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm")
+endif ()
+
 set(AOM_DSP_COMMON_ASM_NEON
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm"
+    ${AOM_DSP_COMMON_ASM_NEON}
     "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm"
@@ -83,33 +99,53 @@ set(AOM_DSP_COMMON_ASM_NEON
     "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
-    "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm"
     "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm")
 
+if (NOT CONFIG_PARALLEL_DEBLOCKING)
+  set(AOM_DSP_COMMON_ASM_NEON
+      ${AOM_DSP_COMMON_ASM_NEON}
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm"
+      "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm")
+endif ()
+
+if (NOT CONFIG_EXT_PARTITION)
+  set(AOM_DSP_COMMON_INTRIN_NEON
+      "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c")
+endif ()
+
 set(AOM_DSP_COMMON_INTRIN_NEON
-    "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c"
+    ${AOM_DSP_COMMON_INTRIN_NEON}
     "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
     "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
"${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c") +endif () + if ("${AOM_TARGET_CPU}" STREQUAL "arm64") + if (NOT CONFIG_EXT_PARTITION) + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c") + endif () + set(AOM_DSP_COMMON_INTRIN_NEON ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c" @@ -118,10 +154,15 @@ if ("${AOM_TARGET_CPU}" STREQUAL "arm64") "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c") + + if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") + endif () endif () set(AOM_DSP_COMMON_INTRIN_DSPR2 @@ -141,14 +182,19 @@ set(AOM_DSP_COMMON_INTRIN_DSPR2 "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") + +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_DSPR2 + ${AOM_DSP_COMMON_INTRIN_DSPR2} + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") +endif () set(AOM_DSP_COMMON_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c" @@ -169,13 +215,18 @@ set(AOM_DSP_COMMON_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c" "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" - 
"${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h" "${AOM_ROOT}/aom_dsp/mips/macros_msa.h" "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h") +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_MSA + ${AOM_DSP_COMMON_INTRIN_MSA} + "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h") +endif () + if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_COMMON_ASM_SSE2 ${AOM_DSP_COMMON_ASM_SSE2} @@ -185,11 +236,18 @@ if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_COMMON_INTRIN_SSE2 ${AOM_DSP_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") + set(AOM_DSP_COMMON_INTRIN_SSSE3 + ${AOM_DSP_COMMON_INTRIN_SSSE3} + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_ssse3.c") + set(AOM_DSP_COMMON_INTRIN_AVX2 ${AOM_DSP_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c") else () set(AOM_DSP_COMMON_INTRIN_DSPR2 ${AOM_DSP_COMMON_INTRIN_DSPR2} @@ -332,12 +390,10 @@ if (CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") - if (CONFIG_EXT_INTER) set(AOM_DSP_ENCODER_INTRIN_SSSE3 ${AOM_DSP_ENCODER_INTRIN_SSSE3} "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c") - endif () if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_ENCODER_INTRIN_SSE2 diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk index f9d675ac0..950db0216 100644 --- a/third_party/aom/aom_dsp/aom_dsp.mk +++ b/third_party/aom/aom_dsp/aom_dsp.mk @@ -64,6 +64,7 @@ endif # intra predictions DSP_SRCS-yes += intrapred.c +DSP_SRCS-yes += intrapred_common.h ifneq ($(CONFIG_ANS),yes) DSP_SRCS-yes += entcode.c @@ -75,9 +76,16 @@ DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.c +DSP_SRCS-$(HAVE_AVX2) += x86/intrapred_avx2.c + ifeq ($(CONFIG_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_avx2.c endif # CONFIG_HIGHBITDEPTH DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) @@ -120,6 +128,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c endif DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm +ifneq ($(CONFIG_EXT_PARTITION),yes) ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM) DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM) @@ -135,6 +144,7 @@ DSP_SRCS-yes += arm/aom_convolve_avg_neon.c DSP_SRCS-yes += arm/aom_convolve_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM +endif # CONFIG_EXT_PARTITION # common (msa) DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c @@ -164,7 +174,10 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c DSP_SRCS-yes += loopfilter.c DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c -DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c +DSP_SRCS-$(HAVE_SSE2) += x86/lpf_common_sse2.h + +ifneq 
($(CONFIG_PARALLEL_DEBLOCKING),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c ifeq ($(HAVE_NEON_ASM),yes) @@ -191,13 +204,16 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c +endif # !CONFIG_PARALLEL_DEBLOCKING ifeq ($(CONFIG_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_loopfilter_avx2.c endif # CONFIG_HIGHBITDEPTH DSP_SRCS-yes += txfm_common.h DSP_SRCS-yes += x86/txfm_common_intrin.h +DSP_SRCS-$(HAVE_AVX2) += x86/common_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/obmc_intrinsic_ssse3.h DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h @@ -343,10 +359,8 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c endif ifeq ($(CONFIG_AV1_ENCODER),yes) -ifeq ($(CONFIG_EXT_INTER),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c -endif #CONFIG_EXT_INTER ifeq ($(CONFIG_MOTION_VAR),yes) DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h index 5b104321b..3d3bcba37 100644 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -52,10 +52,9 @@ extern "C" { #define UNLIKELY(v) (v) #endif -#if CONFIG_AOM_QM typedef uint16_t qm_val_t; #define AOM_QM_BITS 5 -#endif + #if CONFIG_HIGHBITDEPTH // Note: // tran_low_t is the datatype used for final transform coefficients. @@ -78,6 +77,10 @@ static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); } +static INLINE uint32_t clamp32u(uint32_t value, uint32_t low, uint32_t high) { + return value < low ? low : (value > high ? high : value); +} + static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { return value < low ? low : (value > high ? 
high : value); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 0c0356870..f4f6c64d4 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -40,11 +40,17 @@ foreach $w (@block_widths) { push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ; } } -if (aom_config("CONFIG_EXT_PARTITION_TYPES")) { +if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") { push @block_sizes, [4, 16]; push @block_sizes, [16, 4]; push @block_sizes, [8, 32]; push @block_sizes, [32, 8]; + push @block_sizes, [16, 64]; + push @block_sizes, [64, 16]; + if (aom_config("CONFIG_EXT_PARTITION") eq "yes") { + push @block_sizes, [32, 128]; + push @block_sizes, [128, 32]; + } } @tx_dims = (2, 4, 8, 16, 32); @@ -60,14 +66,9 @@ foreach $w (@tx_dims) { } } -@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153/; -if (aom_config("CONFIG_ALT_INTRA") eq "yes") { - push @pred_names, qw/paeth smooth/; - if (aom_config("CONFIG_SMOOTH_HV") eq "yes") { - push @pred_names, qw/smooth_v smooth_h/; - } -} else { - push @pred_names, 'tm'; +@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153 paeth smooth/; +if (aom_config("CONFIG_SMOOTH_HV") eq "yes") { + push @pred_names, qw/smooth_v smooth_h/; } # @@ -86,70 +87,185 @@ foreach (@tx_sizes) { } } -specialize qw/aom_d63e_predictor_4x4 ssse3/; -specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; -specialize qw/aom_d135_predictor_4x4 neon/; -specialize qw/aom_d153_predictor_4x4 ssse3/; -specialize qw/aom_v_predictor_4x4 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_4x4 neon dspr2 msa sse2/; -} # CONFIG_ALT_INTRA -specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/; +specialize qw/aom_dc_top_predictor_4x8 sse2/; +specialize qw/aom_dc_top_predictor_8x4 sse2/; +specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/; +specialize qw/aom_dc_top_predictor_8x16 sse2/; +specialize qw/aom_dc_top_predictor_16x8 sse2/; +specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/; +specialize qw/aom_dc_top_predictor_16x32 sse2/; +specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/; specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/; +specialize qw/aom_dc_left_predictor_4x8 sse2/; +specialize qw/aom_dc_left_predictor_8x4 sse2/; +specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/; +specialize qw/aom_dc_left_predictor_8x16 sse2/; +specialize qw/aom_dc_left_predictor_16x8 sse2/; +specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/; +specialize qw/aom_dc_left_predictor_16x32 sse2/; +specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/; specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/; +specialize qw/aom_dc_128_predictor_4x8 sse2/; +specialize qw/aom_dc_128_predictor_8x4 sse2/; +specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/; +specialize qw/aom_dc_128_predictor_8x16 sse2/; +specialize qw/aom_dc_128_predictor_16x8 sse2/; +specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/; +specialize qw/aom_dc_128_predictor_16x32 sse2/; +specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_v_predictor_4x4 neon msa sse2/; +specialize qw/aom_v_predictor_4x8 sse2/; +specialize qw/aom_v_predictor_8x4 sse2/; +specialize 
qw/aom_v_predictor_8x8 neon msa sse2/; +specialize qw/aom_v_predictor_8x16 sse2/; +specialize qw/aom_v_predictor_16x8 sse2/; +specialize qw/aom_v_predictor_16x16 neon msa sse2/; +specialize qw/aom_v_predictor_16x32 sse2/; +specialize qw/aom_v_predictor_32x16 sse2 avx2/; +specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_h_predictor_4x8 sse2/; +specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x4 sse2/; specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x16 sse2/; +specialize qw/aom_h_predictor_16x8 sse2/; +specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_16x32 sse2/; +specialize qw/aom_h_predictor_32x16 sse2/; +specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_paeth_predictor_4x4 ssse3/; +specialize qw/aom_paeth_predictor_4x8 ssse3/; +specialize qw/aom_paeth_predictor_8x4 ssse3/; +specialize qw/aom_paeth_predictor_8x8 ssse3/; +specialize qw/aom_paeth_predictor_8x16 ssse3/; +specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x8 ssse3/; +specialize qw/aom_paeth_predictor_16x16 ssse3/; +specialize qw/aom_paeth_predictor_16x32 ssse3/; +specialize qw/aom_paeth_predictor_32x16 ssse3/; +specialize qw/aom_paeth_predictor_32x32 ssse3/; +specialize qw/aom_smooth_predictor_4x4 ssse3/; +specialize qw/aom_smooth_predictor_4x8 ssse3/; +specialize qw/aom_smooth_predictor_8x4 ssse3/; +specialize qw/aom_smooth_predictor_8x8 ssse3/; +specialize qw/aom_smooth_predictor_8x16 ssse3/; +specialize qw/aom_smooth_predictor_16x8 ssse3/; +specialize qw/aom_smooth_predictor_16x16 ssse3/; +specialize qw/aom_smooth_predictor_16x32 ssse3/; +specialize qw/aom_smooth_predictor_32x16 ssse3/; +specialize qw/aom_smooth_predictor_32x32 ssse3/; + +specialize qw/aom_d63e_predictor_4x4 ssse3/; +specialize qw/aom_d135_predictor_4x4 neon/; +specialize qw/aom_d153_predictor_4x4 ssse3/; +specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; +specialize qw/aom_dc_predictor_4x8 sse2/; specialize qw/aom_d153_predictor_8x8 ssse3/; -specialize qw/aom_v_predictor_8x8 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_8x8 neon dspr2 msa sse2/; -} # CONFIG_ALT_INTRA +specialize qw/aom_dc_predictor_8x4 sse2/; specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/; -specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/; -specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/; -specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/; -specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; +specialize qw/aom_dc_predictor_8x16 sse2/; specialize qw/aom_d153_predictor_16x16 ssse3/; -specialize qw/aom_v_predictor_16x16 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_16x16 neon msa sse2/; -} # CONFIG_ALT_INTRA +specialize qw/aom_dc_predictor_16x8 sse2/; specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/; -specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/; -specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/; -specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/; -specialize qw/aom_h_predictor_32x32 neon msa sse2/; +specialize qw/aom_dc_predictor_16x32 sse2/; specialize qw/aom_d153_predictor_32x32 ssse3/; -specialize 
qw/aom_v_predictor_32x32 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_32x32 neon msa sse2/; -} # CONFIG_ALT_INTRA -specialize qw/aom_dc_predictor_32x32 msa neon sse2/; -specialize qw/aom_dc_top_predictor_32x32 msa neon sse2/; -specialize qw/aom_dc_left_predictor_32x32 msa neon sse2/; -specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/; + +specialize qw/aom_dc_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { specialize qw/aom_highbd_v_predictor_4x4 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_4x4 sse2/; - } # CONFIG_ALT_INTRA - specialize qw/aom_highbd_dc_predictor_4x4 sse2/; + specialize qw/aom_highbd_v_predictor_4x8 sse2/; + specialize qw/aom_highbd_v_predictor_8x4 sse2/; specialize qw/aom_highbd_v_predictor_8x8 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_8x8 sse2/; - } # CONFIG_ALT_INTRA - specialize qw/aom_highbd_dc_predictor_8x8 sse2/;; + specialize qw/aom_highbd_v_predictor_8x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x8 sse2/; specialize qw/aom_highbd_v_predictor_16x16 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_16x16 sse2/; - } # CONFIG_ALT_INTRA - specialize qw/aom_highbd_dc_predictor_16x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x32 sse2/; + specialize qw/aom_highbd_v_predictor_32x16 sse2/; specialize qw/aom_highbd_v_predictor_32x32 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_32x32 sse2/; - } # CONFIG_ALT_INTRA + specialize qw/aom_highbd_dc_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_predictor_8x4 sse2/;; + specialize qw/aom_highbd_dc_predictor_8x8 sse2/;; + specialize qw/aom_highbd_dc_predictor_8x16 sse2/;; + specialize qw/aom_highbd_dc_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_predictor_32x16 sse2/; specialize qw/aom_highbd_dc_predictor_32x32 sse2/; + specialize qw/aom_highbd_h_predictor_4x4 sse2/; + specialize qw/aom_highbd_h_predictor_4x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x4 sse2/; + specialize qw/aom_highbd_h_predictor_8x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x8 sse2/; + specialize qw/aom_highbd_h_predictor_16x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x32 sse2/; + specialize qw/aom_highbd_h_predictor_32x16 sse2/; + specialize qw/aom_highbd_h_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x16 
sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/; + + specialize qw/aom_highbd_d117_predictor_4x4 sse2/; + specialize qw/aom_highbd_d117_predictor_8x8 ssse3/; + specialize qw/aom_highbd_d117_predictor_16x16 ssse3/; + specialize qw/aom_highbd_d117_predictor_32x32 ssse3/; + specialize qw/aom_highbd_d135_predictor_4x4 sse2/; + specialize qw/aom_highbd_d135_predictor_8x8 ssse3/; + specialize qw/aom_highbd_d135_predictor_16x16 ssse3/; + specialize qw/aom_highbd_d135_predictor_32x32 ssse3/; + specialize qw/aom_highbd_d153_predictor_4x4 sse2/; + specialize qw/aom_highbd_d153_predictor_8x8 ssse3/; + specialize qw/aom_highbd_d153_predictor_16x16 ssse3/; + specialize qw/aom_highbd_d153_predictor_32x32 ssse3/; + + specialize qw/aom_highbd_d45e_predictor_4x4 sse2/; + specialize qw/aom_highbd_d45e_predictor_4x8 sse2/; + specialize qw/aom_highbd_d45e_predictor_8x4 sse2/; + specialize qw/aom_highbd_d45e_predictor_8x8 sse2/; + specialize qw/aom_highbd_d45e_predictor_8x16 sse2/; + specialize qw/aom_highbd_d45e_predictor_16x8 avx2/; + specialize qw/aom_highbd_d45e_predictor_16x16 avx2/; + specialize qw/aom_highbd_d45e_predictor_16x32 avx2/; + specialize qw/aom_highbd_d45e_predictor_32x16 avx2/; + specialize qw/aom_highbd_d45e_predictor_32x32 avx2/; } # CONFIG_HIGHBITDEPTH # @@ -257,83 +373,121 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { # Loopfilter # add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/; -$aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_vertical_16 sse2/; +} else { + specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/; + $aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon; +} add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; -$aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; + $aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon; +} add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_vertical_8 sse2/; +} else { + specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int 
pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; -$aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; + $aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon; +} add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_vertical_4 sse2/; +} else { + specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; -$aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_edge_8 sse2/; +} else { + specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; + $aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon; +} add_proto qw/void aom_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; -$aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_edge_16 sse2/; +} else { + specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; + $aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon; +} add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_8 sse2/; +} else { + specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; -$aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; + $aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon; +} add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_4 sse2/; +} else { + specialize 
qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +} if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_16 sse2/; add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_16_dual sse2/; + specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_8 sse2/; add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_8_dual sse2/; + specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_4 sse2/; add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_4_dual sse2/; + specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_edge_8 sse2/; add_proto qw/void aom_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2/; + specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_8 sse2/; add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_8_dual sse2/; + specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_4 sse2/; add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_4_dual sse2/; + 
specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; } # CONFIG_HIGHBITDEPTH # @@ -412,51 +566,48 @@ if (aom_config("CONFIG_AV1") eq "yes") { add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - { - add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_16_add sse2/; + add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct4x4_16_add sse2/; - add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_1_add sse2/; + add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct4x4_1_add sse2/; - add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_64_add sse2 ssse3/; + add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct8x8_64_add sse2 ssse3/; - add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_12_add sse2 ssse3/; + add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct8x8_12_add sse2 ssse3/; - add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_1_add sse2/; + add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct8x8_1_add sse2/; - add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_256_add sse2 avx2/; + add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_256_add sse2 avx2/; - add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_38_add avx2/; + add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_38_add avx2/; - add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_10_add sse2 avx2/; + add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_10_add sse2 avx2/; - add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_1_add sse2 avx2/; + add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_1_add sse2 avx2/; - add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/; + add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/; - add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/; - # Need to add 135 eob idct32x32 implementations. 
- $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; + add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/; + # Need to add 135 eob idct32x32 implementations. + $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; - add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/; + add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/; - add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1_add sse2 avx2/; - } -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { -} else { - { + add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_1_add sse2 avx2/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + } else { add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_idct4x4_1_add sse2 neon dspr2 msa/; @@ -508,48 +659,32 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_iwht4x4_16_add msa sse2/; - } -} # CONFIG_HIGHBITDEPTH + } # CONFIG_HIGHBITDEPTH } # CONFIG_AV1 # # Quantization # -if (aom_config("CONFIG_AOM_QM") eq "yes") { - if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t 
*dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - } # CONFIG_AV1_ENCODER -} else { - if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +} # CONFIG_AV1_ENCODER - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b sse2 avx2/; +if 
(aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b sse2 avx2/; - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b_32x32 sse2/; + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32 sse2/; - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - } # CONFIG_AV1_ENCODER -} # CONFIG_AOM_QM +} # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1") eq "yes") { # # Alpha blending with mask @@ -575,147 +710,146 @@ if (aom_config("CONFIG_AV1") eq "yes") { } # CONFIG_AV1 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { -# -# Block subtraction -# -add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/aom_subtract_block neon msa sse2/; - -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { -# -# Sum of Squares -# -add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; -specialize qw/aom_sum_squares_2d_i16 sse2/; - -add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; -specialize qw/aom_sum_squares_i16 sse2/; -} - - -# -# Avg -# -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # - # Avg + # Block subtraction # - specialize qw/aom_avg_8x8 sse2 neon msa/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/aom_highbd_subtract_block sse2/; + add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t 
pred_stride"; + specialize qw/aom_subtract_block neon msa sse2/; + + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Sum of Squares + # + add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; + specialize qw/aom_sum_squares_2d_i16 sse2/; + + add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; + specialize qw/aom_sum_squares_i16 sse2/; } + # - # Minmax + # Avg # - add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - specialize qw/aom_minmax_8x8 sse2 neon/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - } - - add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64"; + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Avg + # + specialize qw/aom_avg_8x8 sse2 neon msa/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/aom_highbd_subtract_block sse2/; + } - add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/aom_hadamard_16x16 sse2 neon/; + # + # Minmax + # + add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + specialize qw/aom_minmax_8x8 sse2 neon/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + } - add_proto qw/int aom_satd/, "const int16_t *coeff, int length"; - specialize qw/aom_satd sse2 neon/; + add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64"; - add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height"; - specialize qw/aom_int_pro_row sse2 neon/; + add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_16x16 sse2 neon/; - add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width"; - specialize qw/aom_int_pro_col sse2 neon/; + add_proto qw/int aom_satd/, "const int16_t *coeff, int length"; + specialize qw/aom_satd sse2 neon/; - add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; - specialize qw/aom_vector_var neon sse2/; -} # CONFIG_AV1_ENCODER + add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height"; + specialize qw/aom_int_pro_row sse2 neon/; -# -# Single block SAD / Single block Avg SAD -# -foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -} + add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width"; + specialize qw/aom_int_pro_col sse2 neon/; -specialize qw/aom_sad128x128 avx2 sse2/; -specialize qw/aom_sad128x64 avx2 sse2/; -specialize 
qw/aom_sad64x128 avx2 sse2/; -specialize qw/aom_sad64x64 avx2 neon msa sse2/; -specialize qw/aom_sad64x32 avx2 msa sse2/; -specialize qw/aom_sad32x64 avx2 msa sse2/; -specialize qw/aom_sad32x32 avx2 neon msa sse2/; -specialize qw/aom_sad32x16 avx2 msa sse2/; -specialize qw/aom_sad16x32 msa sse2/; -specialize qw/aom_sad16x16 neon msa sse2/; -specialize qw/aom_sad16x8 neon msa sse2/; -specialize qw/aom_sad8x16 neon msa sse2/; -specialize qw/aom_sad8x8 neon msa sse2/; -specialize qw/aom_sad8x4 msa sse2/; -specialize qw/aom_sad4x8 msa sse2/; -specialize qw/aom_sad4x4 neon msa sse2/; - -specialize qw/aom_sad128x128_avg avx2 sse2/; -specialize qw/aom_sad128x64_avg avx2 sse2/; -specialize qw/aom_sad64x128_avg avx2 sse2/; -specialize qw/aom_sad64x64_avg avx2 msa sse2/; -specialize qw/aom_sad64x32_avg avx2 msa sse2/; -specialize qw/aom_sad32x64_avg avx2 msa sse2/; -specialize qw/aom_sad32x32_avg avx2 msa sse2/; -specialize qw/aom_sad32x16_avg avx2 msa sse2/; -specialize qw/aom_sad16x32_avg msa sse2/; -specialize qw/aom_sad16x16_avg msa sse2/; -specialize qw/aom_sad16x8_avg msa sse2/; -specialize qw/aom_sad8x16_avg msa sse2/; -specialize qw/aom_sad8x8_avg msa sse2/; -specialize qw/aom_sad8x4_avg msa sse2/; -specialize qw/aom_sad4x8_avg msa sse2/; -specialize qw/aom_sad4x4_avg msa sse2/; + add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; + specialize qw/aom_vector_var neon sse2/; + } # CONFIG_AV1_ENCODER -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + # + # Single block SAD / Single block Avg SAD + # foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - if ($w != 128 && $h != 128 && $w != 4) { - specialize "aom_highbd_sad${w}x${h}", qw/sse2/; - specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; + add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + } + + specialize qw/aom_sad128x128 avx2 sse2/; + specialize qw/aom_sad128x64 avx2 sse2/; + specialize qw/aom_sad64x128 avx2 sse2/; + specialize qw/aom_sad64x64 avx2 neon msa sse2/; + specialize qw/aom_sad64x32 avx2 msa sse2/; + specialize qw/aom_sad32x64 avx2 msa sse2/; + specialize qw/aom_sad32x32 avx2 neon msa sse2/; + specialize qw/aom_sad32x16 avx2 msa sse2/; + specialize qw/aom_sad16x32 msa sse2/; + specialize qw/aom_sad16x16 neon msa sse2/; + specialize qw/aom_sad16x8 neon msa sse2/; + specialize qw/aom_sad8x16 neon msa sse2/; + specialize qw/aom_sad8x8 neon msa sse2/; + specialize qw/aom_sad8x4 msa sse2/; + specialize qw/aom_sad4x8 msa sse2/; + specialize qw/aom_sad4x4 neon msa sse2/; + + specialize qw/aom_sad128x128_avg avx2 sse2/; + specialize qw/aom_sad128x64_avg avx2 sse2/; + specialize qw/aom_sad64x128_avg avx2 sse2/; + specialize qw/aom_sad64x64_avg avx2 msa sse2/; + specialize qw/aom_sad64x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x64_avg avx2 msa sse2/; + specialize qw/aom_sad32x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x16_avg avx2 msa sse2/; + specialize qw/aom_sad16x32_avg msa sse2/; + specialize qw/aom_sad16x16_avg msa sse2/; + specialize 
qw/aom_sad16x8_avg msa sse2/; + specialize qw/aom_sad8x16_avg msa sse2/; + specialize qw/aom_sad8x8_avg msa sse2/; + specialize qw/aom_sad8x4_avg msa sse2/; + specialize qw/aom_sad4x8_avg msa sse2/; + specialize qw/aom_sad4x4_avg msa sse2/; + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_sad${w}x${h}", qw/sse2/; + specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; + } } + specialize qw/aom_highbd_sad128x128 avx2/; + specialize qw/aom_highbd_sad128x64 avx2/; + specialize qw/aom_highbd_sad64x128 avx2/; + specialize qw/aom_highbd_sad64x64 avx2/; + specialize qw/aom_highbd_sad64x32 avx2/; + specialize qw/aom_highbd_sad32x64 avx2/; + specialize qw/aom_highbd_sad32x32 avx2/; + specialize qw/aom_highbd_sad32x16 avx2/; + specialize qw/aom_highbd_sad16x32 avx2/; + specialize qw/aom_highbd_sad16x16 avx2/; + specialize qw/aom_highbd_sad16x8 avx2/; + + specialize qw/aom_highbd_sad128x128_avg avx2/; + specialize qw/aom_highbd_sad128x64_avg avx2/; + specialize qw/aom_highbd_sad64x128_avg avx2/; + specialize qw/aom_highbd_sad64x64_avg avx2/; + specialize qw/aom_highbd_sad64x32_avg avx2/; + specialize qw/aom_highbd_sad32x64_avg avx2/; + specialize qw/aom_highbd_sad32x32_avg avx2/; + specialize qw/aom_highbd_sad32x16_avg avx2/; + specialize qw/aom_highbd_sad16x32_avg avx2/; + specialize qw/aom_highbd_sad16x16_avg avx2/; + specialize qw/aom_highbd_sad16x8_avg avx2/; } - specialize qw/aom_highbd_sad128x128 avx2/; - specialize qw/aom_highbd_sad128x64 avx2/; - specialize qw/aom_highbd_sad64x128 avx2/; - specialize qw/aom_highbd_sad64x64 avx2/; - specialize qw/aom_highbd_sad64x32 avx2/; - specialize qw/aom_highbd_sad32x64 avx2/; - specialize qw/aom_highbd_sad32x32 avx2/; - specialize qw/aom_highbd_sad32x16 avx2/; - specialize qw/aom_highbd_sad16x32 avx2/; - specialize qw/aom_highbd_sad16x16 avx2/; - specialize qw/aom_highbd_sad16x8 avx2/; - - specialize qw/aom_highbd_sad128x128_avg avx2/; - specialize qw/aom_highbd_sad128x64_avg avx2/; - specialize qw/aom_highbd_sad64x128_avg avx2/; - specialize qw/aom_highbd_sad64x64_avg avx2/; - specialize qw/aom_highbd_sad64x32_avg avx2/; - specialize qw/aom_highbd_sad32x64_avg avx2/; - specialize qw/aom_highbd_sad32x32_avg avx2/; - specialize qw/aom_highbd_sad32x16_avg avx2/; - specialize qw/aom_highbd_sad16x32_avg avx2/; - specialize qw/aom_highbd_sad16x16_avg avx2/; - specialize qw/aom_highbd_sad16x8_avg avx2/; -} -# -# Masked SAD -# -if (aom_config("CONFIG_EXT_INTER") eq "yes") { + # + # Masked SAD + # foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; @@ -729,318 +863,326 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") { specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/; } } -} - -# -# OBMC SAD -# -if (aom_config("CONFIG_MOTION_VAR") eq "yes") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; - specialize 
"aom_obmc_sad${w}x${h}", qw/sse4_1/; - } - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + # + # OBMC SAD + # + if (aom_config("CONFIG_MOTION_VAR") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; - specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; + add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/; + } + } + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; + } + } } } -} -# -# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally -# -# Blocks of 3 -foreach $s (@block_widths) { - add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -} -specialize qw/aom_sad64x64x3 msa/; -specialize qw/aom_sad32x32x3 msa/; -specialize qw/aom_sad16x16x3 sse3 ssse3 msa/; -specialize qw/aom_sad8x8x3 sse3 msa/; -specialize qw/aom_sad4x4x3 sse3 msa/; - -add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad16x8x3 sse3 ssse3 msa/; -add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad8x16x3 sse3 msa/; - -# Blocks of 8 -foreach $s (@block_widths) { - add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -} -specialize qw/aom_sad64x64x8 msa/; -specialize qw/aom_sad32x32x8 msa/; -specialize qw/aom_sad16x16x8 sse4_1 msa/; -specialize qw/aom_sad8x8x8 sse4_1 msa/; -specialize qw/aom_sad4x4x8 sse4_1 msa/; - -add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad16x8x8 sse4_1 msa/; -add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad8x16x8 sse4_1 msa/; -add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad8x4x8 msa/; -add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad4x8x8 msa/; + # + # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally + # + # Blocks of 3 + foreach $s (@block_widths) { + add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + } + specialize qw/aom_sad64x64x3 msa/; + specialize qw/aom_sad32x32x3 msa/; + specialize qw/aom_sad16x16x3 sse3 ssse3 msa/; + specialize qw/aom_sad8x8x3 sse3 msa/; + specialize qw/aom_sad4x4x3 sse3 msa/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto 
qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad16x8x3 sse3 ssse3 msa/; + add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad8x16x3 sse3 msa/; + + # Blocks of 8 foreach $s (@block_widths) { + add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + } + specialize qw/aom_sad64x64x8 msa/; + specialize qw/aom_sad32x32x8 msa/; + specialize qw/aom_sad16x16x8 sse4_1 msa/; + specialize qw/aom_sad8x8x8 sse4_1 msa/; + specialize qw/aom_sad4x4x8 sse4_1 msa/; + + add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad16x8x8 sse4_1 msa/; + add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad8x16x8 sse4_1 msa/; + add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad8x4x8 msa/; + add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad4x8x8 msa/; + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $s (@block_widths) { + # Blocks of 3 + add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + # Blocks of 8 + add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + } # Blocks of 3 - add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; # Blocks of 8 - add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; } - # Blocks of 3 - add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - # Blocks of 8 - add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -} - -# -# Multi-block SAD, comparing a reference to N independent blocks -# -foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -} - -specialize qw/aom_sad128x128x4d avx2 sse2/; -specialize qw/aom_sad128x64x4d avx2 sse2/; -specialize qw/aom_sad64x128x4d avx2 sse2/; -specialize qw/aom_sad64x64x4d avx2 neon msa sse2/; -specialize qw/aom_sad64x32x4d avx2 msa sse2/; -specialize qw/aom_sad32x64x4d avx2 msa sse2/; -specialize qw/aom_sad32x32x4d avx2 neon msa sse2/; -specialize qw/aom_sad32x16x4d msa sse2/; -specialize qw/aom_sad16x32x4d msa sse2/; -specialize qw/aom_sad16x16x4d neon msa sse2/; -specialize qw/aom_sad16x8x4d msa sse2/; -specialize qw/aom_sad8x16x4d msa sse2/; -specialize qw/aom_sad8x8x4d msa sse2/; -specialize qw/aom_sad8x4x4d msa sse2/; -specialize qw/aom_sad4x8x4d msa sse2/; -specialize qw/aom_sad4x4x4d msa sse2/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { # # Multi-block SAD, comparing a reference to N independent blocks # foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; - if ($w != 128 && $h != 128) { - specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; + add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + } + + specialize qw/aom_sad128x128x4d avx2 sse2/; + specialize qw/aom_sad128x64x4d avx2 sse2/; + specialize qw/aom_sad64x128x4d avx2 sse2/; + specialize qw/aom_sad64x64x4d avx2 neon msa sse2/; + specialize qw/aom_sad64x32x4d avx2 msa sse2/; + specialize qw/aom_sad32x64x4d avx2 msa sse2/; + specialize qw/aom_sad32x32x4d avx2 neon msa sse2/; + specialize qw/aom_sad32x16x4d msa sse2/; + specialize qw/aom_sad16x32x4d msa sse2/; + specialize qw/aom_sad16x16x4d neon msa sse2/; + specialize qw/aom_sad16x8x4d msa sse2/; + specialize qw/aom_sad8x16x4d msa sse2/; + specialize qw/aom_sad8x8x4d msa sse2/; + specialize qw/aom_sad8x4x4d msa sse2/; + specialize qw/aom_sad4x8x4d msa sse2/; + specialize qw/aom_sad4x4x4d msa sse2/; + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + # + # Multi-block SAD, comparing a reference to N independent blocks + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + if ($w != 128 && $h != 128) { + specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; + } } + specialize qw/aom_highbd_sad128x128x4d avx2/; + specialize qw/aom_highbd_sad128x64x4d avx2/; + specialize qw/aom_highbd_sad64x128x4d avx2/; + specialize qw/aom_highbd_sad64x64x4d avx2/; + specialize qw/aom_highbd_sad64x32x4d avx2/; + specialize qw/aom_highbd_sad32x64x4d avx2/; + specialize qw/aom_highbd_sad32x32x4d avx2/; 
+ specialize qw/aom_highbd_sad32x16x4d avx2/; + specialize qw/aom_highbd_sad16x32x4d avx2/; + specialize qw/aom_highbd_sad16x16x4d avx2/; + specialize qw/aom_highbd_sad16x8x4d avx2/; } - specialize qw/aom_highbd_sad128x128x4d avx2/; - specialize qw/aom_highbd_sad128x64x4d avx2/; - specialize qw/aom_highbd_sad64x128x4d avx2/; - specialize qw/aom_highbd_sad64x64x4d avx2/; - specialize qw/aom_highbd_sad64x32x4d avx2/; - specialize qw/aom_highbd_sad32x64x4d avx2/; - specialize qw/aom_highbd_sad32x32x4d avx2/; - specialize qw/aom_highbd_sad32x16x4d avx2/; - specialize qw/aom_highbd_sad16x32x4d avx2/; - specialize qw/aom_highbd_sad16x16x4d avx2/; - specialize qw/aom_highbd_sad16x8x4d avx2/; -} -# -# Structured Similarity (SSIM) -# -if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { - add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; + # + # Structured Similarity (SSIM) + # + if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; - add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; + add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + } } -} } # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { -# -# Specialty Variance -# -add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + # + # Specialty Variance + # + add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/aom_get16x16var sse2 avx2 neon msa/; -specialize qw/aom_get8x8var sse2 neon msa/; + specialize qw/aom_get16x16var sse2 avx2 neon msa/; + specialize qw/aom_get8x8var sse2 neon msa/; -add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse"; -add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/aom_mse16x16 sse2 avx2 neon msa/; -specialize qw/aom_mse16x8 sse2 msa/; -specialize qw/aom_mse8x16 sse2 msa/; -specialize qw/aom_mse8x8 sse2 msa/; + specialize qw/aom_mse16x16 sse2 avx2 neon msa/; + specialize qw/aom_mse16x8 sse2 msa/; + specialize qw/aom_mse8x16 sse2 msa/; + specialize qw/aom_mse8x8 sse2 msa/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd (8, 10, 12) { - add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; - specialize "aom_highbd_${bd}_mse8x8", qw/sse2/; + specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; + specialize 
"aom_highbd_${bd}_mse8x8", qw/sse2/; + } } -} -# -# ... -# -add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; -specialize qw/aom_upsampled_pred sse2/; -add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; -specialize qw/aom_comp_avg_upsampled_pred sse2/; + # + # ... + # + add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; + specialize qw/aom_upsampled_pred sse2/; + add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; + specialize qw/aom_comp_avg_upsampled_pred sse2/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; - specialize qw/aom_highbd_upsampled_pred sse2/; - add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; - specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; -} + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + specialize qw/aom_highbd_upsampled_pred sse2/; + add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; + } -# -# ... -# -add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; -add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; + # + # ... 
+ # + add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; + add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; -specialize qw/aom_get_mb_ss sse2 msa/; -specialize qw/aom_get4x4sse_cs neon msa/; + specialize qw/aom_get_mb_ss sse2 msa/; + specialize qw/aom_get4x4sse_cs neon msa/; -# -# Variance / Subpixel Variance / Subpixel Avg Variance -# + # + # Variance / Subpixel Variance / Subpixel Avg Variance + # add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; -} + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + } -specialize qw/aom_variance64x64 sse2 avx2 neon msa/; -specialize qw/aom_variance64x32 sse2 avx2 neon msa/; -specialize qw/aom_variance32x64 sse2 neon msa/; -specialize qw/aom_variance32x32 sse2 avx2 neon msa/; -specialize qw/aom_variance32x16 sse2 avx2 msa/; -specialize qw/aom_variance16x32 sse2 msa/; -specialize qw/aom_variance16x16 sse2 avx2 neon msa/; -specialize qw/aom_variance16x8 sse2 neon msa/; -specialize qw/aom_variance8x16 sse2 neon msa/; -specialize qw/aom_variance8x8 sse2 neon msa/; -specialize qw/aom_variance8x4 sse2 msa/; -specialize qw/aom_variance4x8 sse2 msa/; -specialize qw/aom_variance4x4 sse2 msa/; - -specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance4x8 msa sse2 
ssse3/; -specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; - -specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - -if (aom_config("CONFIG_EXT_PARTITION_TYPES")) { - specialize qw/aom_variance4x16 sse2/; - specialize qw/aom_variance16x4 sse2/; - specialize qw/aom_variance8x32 sse2/; - specialize qw/aom_variance32x8 sse2/; - specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; -} + specialize qw/aom_variance64x64 sse2 avx2 neon msa/; + specialize qw/aom_variance64x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x64 sse2 neon msa/; + specialize qw/aom_variance32x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x16 sse2 avx2 msa/; + specialize qw/aom_variance16x32 sse2 msa/; + specialize qw/aom_variance16x16 sse2 avx2 neon msa/; + specialize qw/aom_variance16x8 sse2 neon msa/; + specialize qw/aom_variance8x16 sse2 neon msa/; + specialize qw/aom_variance8x8 sse2 neon msa/; + specialize qw/aom_variance8x4 sse2 msa/; + specialize qw/aom_variance4x8 sse2 msa/; + specialize qw/aom_variance4x4 sse2 msa/; + + specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd (8, 10, 12) { - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + 
specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + + if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") { + specialize qw/aom_variance4x16 sse2/; + specialize qw/aom_variance16x4 sse2/; + specialize qw/aom_variance8x32 sse2/; + specialize qw/aom_variance32x8 sse2/; + specialize qw/aom_variance16x64 sse2/; + specialize qw/aom_variance64x16 sse2/; + specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; + } - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { - specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; - } - # TODO(david.barker): When ext-partition-types is enabled, we currenly - # don't have vectorized 4x16 highbd variance functions - if ($w == 4 && $h == 4) { - specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; - } - if ($w != 128 && $h != 128 && $w != 4) { - specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; - } - if ($w == 4 && $h == 4) { - specialize 
"aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; + } + # TODO(david.barker): When ext-partition-types is enabled, we currently + # don't have vectorized 4x16 highbd variance functions + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; + } + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; + } + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + } } } - } -} # CONFIG_HIGHBITDEPTH + } # CONFIG_HIGHBITDEPTH -if (aom_config("CONFIG_EXT_INTER") eq "yes") { -# -# Masked Variance / Masked Subpixel Variance -# + # + # Masked Variance / Masked Subpixel Variance + # foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; @@ -1056,453 +1198,450 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") { } } } -} -# -# OBMC Variance / OBMC Subpixel Variance -# -if (aom_config("CONFIG_MOTION_VAR") eq "yes") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; - } + # + # OBMC Variance / OBMC Subpixel Variance + # + if (aom_config("CONFIG_MOTION_VAR") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; + } - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd ("_", "_10_", "_12_") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, 
"aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_", "_10_", "_12_") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; + } } } } -} -add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; -# -# Specialty Subpixel -# -add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + # + # Specialty Subpixel + # + add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_variance_halfpixvar16x16_h sse2/; -add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_variance_halfpixvar16x16_v sse2/; -add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_variance_halfpixvar16x16_hv sse2/; -# -# Comp Avg -# -add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x64 sse2/; + # + # Comp Avg + # + add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x64 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x32 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x32 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x64 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x64 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x32 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x32 
sse2/; - add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x32 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x32 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x8 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x8 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x8 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x8 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int 
aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x64 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x64 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x32 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x32 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x64 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x64 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x32 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x32 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x32 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x32 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x8 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x8 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/aom_highbd_10_variance8x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x8 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x8 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x64 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x64 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x32 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x32 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x64 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x64 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x32 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x32 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x16 sse2/; + add_proto qw/unsigned int 
aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x32 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x32 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x16 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x8 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x8 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x16 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x8 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x8 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + 
add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_mse16x16 sse2/; + add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse16x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_mse8x8 sse2/; + add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse8x8 sse2/; - add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_mse16x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse16x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_mse8x8 sse2/; + add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse8x8 sse2/; - add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_mse16x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse16x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_mse8x8 sse2/; + add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse8x8 sse2/; - add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; - # - # Subpixel Variance - # - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; + # + # Subpixel Variance + # + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; + add_proto qw/uint32_t 
aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; 
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t 
aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize 
qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; -} # CONFIG_HIGHBITDEPTH + } # CONFIG_HIGHBITDEPTH -if (aom_config("CONFIG_EXT_INTER") eq "yes") { add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd"; } -} } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c index 2dc5b2e56..7d5f64004 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -529,229 +529,4 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, } } } - -void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } -} - -void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = 
vdupq_lane_u16(d20u16, 3); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - } -} - -void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } - } -} - -void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = 
vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - } - } -} #endif // !HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm index 7d04d3553..fba9c1b5b 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm +++ b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm @@ -19,10 +19,6 @@ EXPORT |aom_h_predictor_8x8_neon| EXPORT |aom_h_predictor_16x16_neon| EXPORT |aom_h_predictor_32x32_neon| - EXPORT |aom_tm_predictor_4x4_neon| - EXPORT |aom_tm_predictor_8x8_neon| - EXPORT |aom_tm_predictor_16x16_neon| - EXPORT |aom_tm_predictor_32x32_neon| ARM REQUIRE8 PRESERVE8 @@ -289,345 +285,3 @@ loop_h bgt loop_h bx lr ENDP ; |aom_h_predictor_32x32_neon| - -;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_4x4_neon| PROC - ; Load ytop_left = 
above[-1]; - sub r12, r2, #1 - vld1.u8 {d0[]}, [r12] - - ; Load above 4 pixels - vld1.32 {d2[0]}, [r2] - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3]! - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - - ; 3rd row and 4th row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3] - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - bx lr - ENDP ; |aom_tm_predictor_4x4_neon| - -;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_8x8_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; preload 8 left - vld1.8 {d30}, [r3] - - ; Load above 8 pixels - vld1.64 {d2}, [r2] - - vmovl.u8 q10, d30 - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vdup.16 q0, d20[0] - vdup.16 q1, d20[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 3rd row and 4th row - vdup.16 q8, d20[2] - vdup.16 q9, d20[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - ; 5th row and 6th row - vdup.16 q0, d21[0] - vdup.16 q1, d21[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 7th row and 8th row - vdup.16 q8, d21[2] - vdup.16 q9, d21[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - bx lr - ENDP ; |aom_tm_predictor_8x8_neon| - -;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_16x16_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 8 pixels - vld1.8 {q1}, [r2] - - ; preload 8 left into r12 - vld1.8 {d18}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d0 - - vmovl.u8 q10, d18 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 2 times to process 16 rows. - mov r2, #2 - -loop_16x16_neon - ; Process two rows. - vdup.16 q0, d20[0] - vdup.16 q8, d20[1] - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d20[2] ; proload next 2 rows data - vdup.16 q8, d20[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - ; Process two rows. 
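All of the NEON intrinsics and assembly removed in this change implement the TM (true-motion) intra predictor. As a reference for what that code computes, here is a minimal scalar C sketch of the same rule; clip_pixel and the block-size parameter are spelled out locally for illustration rather than taken from this patch.

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* Scalar sketch of the TM predictor: each output pixel is
   left[r] + above[c] - above[-1], saturated to the 8-bit range. */
static void tm_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
                           const uint8_t *above, const uint8_t *left) {
  const int ytop_left = above[-1];
  for (int r = 0; r < bs; ++r) {
    for (int c = 0; c < bs; ++c)
      dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
    dst += stride;
  }
}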
- vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[0] ; proload next 2 rows data - vdup.16 q8, d21[1] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[2] ; proload next 2 rows data - vdup.16 q8, d21[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vld1.8 {d18}, [r3]! ; preload 8 left into r12 - vmovl.u8 q10, d18 - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - subs r2, r2, #1 - bgt loop_16x16_neon - - bx lr - ENDP ; |aom_tm_predictor_16x16_neon| - -;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_32x32_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 32 pixels - vld1.8 {q1}, [r2]! - vld1.8 {q2}, [r2] - - ; preload 8 left pixels - vld1.8 {d26}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q8, d2, d0 - vsubl.u8 q9, d3, d0 - vsubl.u8 q10, d4, d0 - vsubl.u8 q11, d5, d0 - - vmovl.u8 q3, d26 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 4 times to process 32 rows. - mov r2, #4 - -loop_32x32_neon - ; Process two rows. - vdup.16 q0, d6[0] - vdup.16 q2, d6[1] - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q1, d6[2] - vdup.16 q2, d6[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q1, q8 - vadd.s16 q13, q1, q9 - vadd.s16 q14, q1, q10 - vadd.s16 q15, q1, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[0] - vdup.16 q2, d7[1] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[2] - vdup.16 q2, d7[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. 
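[Editor's note] The removed |aom_tm_predictor_*_neon| routines above and below all unroll the same per-pixel arithmetic their comments describe: left + (above - ytop_left), saturated to 8 bits. A minimal scalar sketch of that predictor (mirroring the generic tm_predictor() this patch also removes from intrapred.c further down), assuming the usual clip_pixel() helper from aom_dsp_common.h that clamps to [0, 255]:

#include <stddef.h>
#include "aom_dsp/aom_dsp_common.h"

/* Scalar TM ("true motion") prediction that the NEON code vectorizes:
   pred[r][c] = clip(left[r] + above[c] - above[-1]). */
static void tm_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                const uint8_t *above, const uint8_t *left) {
  const int ytop_left = above[-1];
  for (int r = 0; r < bh; r++) {
    for (int c = 0; c < bw; c++)
      dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
    dst += stride;
  }
}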
- vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vld1.8 {d0}, [r3]! ; preload 8 left pixels - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vmovl.u8 q3, d0 - vst1.64 {d24-d27}, [r0], r1 - - subs r2, r2, #1 - bgt loop_32x32_neon - - bx lr - ENDP ; |aom_tm_predictor_32x32_neon| - - END diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c index bf304dada..4f38afbc5 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.c +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -53,6 +53,15 @@ uint16_t aom_read_primitive_quniform_(aom_reader *r, return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); } +static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, + uint16_t n) { + if (n <= 1) return 0; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + const int v = aom_rb_read_literal(rb, l - 1); + return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); +} + uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p, uint16_t ref ACCT_STR_PARAM) { if (n <= 1) return 0; @@ -101,15 +110,42 @@ uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, return v; } -// Decode finite subexponential code that for a symbol v in [0, n-1] with -// parameter k -// based on a reference ref also in [0, n-1]. +static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb, + uint16_t n, uint16_t k) { + int i = 0; + int mk = 0; + uint16_t v; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + v = aom_rb_read_primitive_quniform(rb, n - mk) + mk; + break; + } else { + if (aom_rb_read_bit(rb)) { + i = i + 1; + mk += a; + } else { + v = aom_rb_read_literal(rb, b) + mk; + break; + } + } + } + return v; +} + uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, uint16_t ref ACCT_STR_PARAM) { return inv_recenter_finite_nonneg( n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); } +static uint16_t aom_rb_read_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) { + return inv_recenter_finite_nonneg(n, ref, + aom_rb_read_primitive_subexpfin(rb, n, k)); +} + // Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with // parameter k based on a reference ref also in [-(n-1), n-1]. 
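[Editor's note] For reference, the quasi-uniform code consumed by the new aom_rb_read_primitive_quniform() above spends either l - 1 or l bits per symbol, where l = get_msb(n - 1) + 1. The sketch below only restates that cost (it mirrors aom_count_primitive_quniform() in binary_codes_writer.c) before the signed wrapper that follows:

#include "aom/aom_integer.h"
#include "aom_ports/bitops.h"

/* Bit cost of the quasi-uniform code for a value v in [0, n - 1]:
   the first m = (1 << l) - n values take l - 1 bits, the rest take l bits. */
static int quniform_bits_sketch(uint16_t n, uint16_t v) {
  if (n <= 1) return 0;
  const int l = get_msb(n - 1) + 1;
  const int m = (1 << l) - n;
  return v < m ? l - 1 : l;
}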
int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, @@ -120,3 +156,10 @@ int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref, ACCT_STR_NAME) - n + 1; } + +int16_t aom_rb_read_signed_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) { + ref += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; +} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h index 1540cf46b..8885142c9 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.h +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -17,9 +17,11 @@ extern "C" { #endif #include <assert.h> + #include "./aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/bitreader.h" +#include "aom_dsp/bitreader_buffer.h" #define aom_read_primitive_symmetric(r, n, ACCT_STR_NAME) \ aom_read_primitive_symmetric_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) @@ -47,6 +49,9 @@ uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, int16_t ref ACCT_STR_PARAM); + +int16_t aom_rb_read_signed_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c index 91e807b29..e092b6278 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.c +++ b/third_party/aom/aom_dsp/binary_codes_writer.c @@ -10,6 +10,7 @@ */ #include "aom_dsp/bitwriter.h" +#include "aom_dsp/binary_codes_writer.h" #include "av1/common/common.h" @@ -68,6 +69,19 @@ void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { } } +static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t v) { + if (n <= 1) return; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + if (v < m) { + aom_wb_write_literal(wb, v, l - 1); + } else { + aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); + aom_wb_write_bit(wb, (v - m) & 1); + } +} + int aom_count_primitive_quniform(uint16_t n, uint16_t v) { if (n <= 1) return 0; const int l = get_msb(n - 1) + 1; @@ -155,6 +169,31 @@ void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, } } +static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + uint16_t v) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + aom_wb_write_primitive_quniform(wb, n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + aom_wb_write_bit(wb, t); + if (t) { + i = i + 1; + mk += a; + } else { + aom_wb_write_literal(wb, v - mk, b); + break; + } + } + } +} + int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { int count = 0; int i = 0; @@ -184,19 +223,34 @@ int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { // based on a reference ref also in [0, n-1]. // Recenters symbol around r first and then uses a finite subexponential code. 
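[Editor's note] The reader/writer pair now agrees on signedness (int16_t ref/v at the signed entry points, uint16_t internally). A hedged round-trip sketch, not part of this patch; the bit-buffer struct initializers are an assumption about the layouts in bitwriter_buffer.h / bitreader_buffer.h:

#include "aom_dsp/binary_codes_reader.h"
#include "aom_dsp/binary_codes_writer.h"

static int16_t refsubexpfin_round_trip_sketch(void) {
  uint8_t buf[64] = { 0 };
  struct aom_write_bit_buffer wb = { buf, 0 }; /* assumed { buffer, offset } */
  aom_wb_write_signed_primitive_refsubexpfin(&wb, /*n=*/64, /*k=*/3,
                                             /*ref=*/10, /*v=*/-7);
  struct aom_read_bit_buffer rb = { buf, buf + sizeof(buf), 0, NULL, NULL };
  /* Returns -7: both sides shift ref and v by n - 1 into [0, 2n - 2] and code
     against scaled_n = 2 * n - 1, as the functions above and below show. */
  return aom_rb_read_signed_primitive_refsubexpfin(&rb, 64, 3, 10);
}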
void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, - int16_t ref, int16_t v) { + uint16_t ref, uint16_t v) { aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v)); } +static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + uint16_t ref, uint16_t v) { + aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v)); +} + void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, - uint16_t k, uint16_t ref, - uint16_t v) { + uint16_t k, int16_t ref, + int16_t v) { ref += n - 1; v += n - 1; const uint16_t scaled_n = (n << 1) - 1; aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v); } +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v); +} + int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v) { return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h index ab5ccbf15..18ad5078f 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.h +++ b/third_party/aom/aom_dsp/binary_codes_writer.h @@ -20,6 +20,7 @@ extern "C" { #include "./aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/bitwriter.h" +#include "aom_dsp/bitwriter_buffer.h" // Codes a symbol v in [-2^mag_bits, 2^mag_bits] // mag_bits is number of bits for magnitude. The alphabet is of size @@ -53,6 +54,10 @@ void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, int16_t ref, int16_t v); +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v); + // Functions that counts bits for the above primitives int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); int aom_count_primitive_quniform(uint16_t n, uint16_t v); diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h index 88bedccc2..00424fa76 100644 --- a/third_party/aom/aom_dsp/bitreader.h +++ b/third_party/aom/aom_dsp/bitreader.h @@ -50,6 +50,11 @@ #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) +#if CONFIG_LV_MAP +#define aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_bin_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) +#endif + #ifdef __cplusplus extern "C" { #endif @@ -198,6 +203,16 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, return ret; } +#if CONFIG_LV_MAP +static INLINE int aom_read_bin_(aom_reader *r, aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int ret; + ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); + update_cdf(cdf, ret, nsymbs); + return ret; +} +#endif + static INLINE int aom_read_tree_as_cdf(aom_reader *r, const aom_tree_index *tree, const aom_prob *probs) { diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h index 68bc1c8f8..7d3b34306 100644 --- a/third_party/aom/aom_dsp/bitwriter.h +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -62,9 +62,8 @@ static INLINE void init_token_stats(TOKEN_STATS *token_stats) { static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { #if CONFIG_ANS - (void)bc; - (void)buffer; - assert(0 && "buf_ans requires a more 
complicated startup procedure"); + aom_buf_ans_alloc(bc, /* error context*/ NULL); + buf_ans_write_init(bc, buffer); #else aom_daala_start_encode(bc, buffer); #endif @@ -72,8 +71,8 @@ static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { static INLINE void aom_stop_encode(aom_writer *bc) { #if CONFIG_ANS - (void)bc; - assert(0 && "buf_ans requires a more complicated shutdown procedure"); + aom_buf_ans_flush(bc); + bc->pos = buf_ans_write_end(bc); #else aom_daala_stop_encode(bc); #endif @@ -143,6 +142,14 @@ static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, update_cdf(cdf, symb, nsymbs); } +#if CONFIG_LV_MAP +static INLINE void aom_write_bin(aom_writer *w, int symb, aom_cdf_prob *cdf, + int nsymbs) { + aom_write_cdf(w, symb, cdf, nsymbs); + update_cdf(cdf, symb, nsymbs); +} +#endif + static INLINE void aom_write_tree_as_cdf(aom_writer *w, const aom_tree_index *tree, const aom_prob *probs, int bits, diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c index 8fe1ff763..f7703dffc 100644 --- a/third_party/aom/aom_dsp/buf_ans.c +++ b/third_party/aom/aom_dsp/buf_ans.c @@ -16,9 +16,8 @@ #include "aom/internal/aom_codec_internal.h" void aom_buf_ans_alloc(struct BufAnsCoder *c, - struct aom_internal_error_info *error, int size) { + struct aom_internal_error_info *error) { c->error = error; - c->size = size; assert(c->size > 1); AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf))); // Initialize to overfull to trigger the assert in write. diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h index 0768506b3..f84ff3aed 100644 --- a/third_party/aom/aom_dsp/buf_ans.h +++ b/third_party/aom/aom_dsp/buf_ans.h @@ -46,6 +46,7 @@ struct BufAnsCoder { #if ANS_MAX_SYMBOLS int window_size; #endif + int pos; // Dummy variable to store the output buffer after closing }; // Allocate a buffered ANS coder to store size symbols. 
@@ -54,7 +55,7 @@ struct BufAnsCoder { // When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the // buffer will grow on demand void aom_buf_ans_alloc(struct BufAnsCoder *c, - struct aom_internal_error_info *error, int hint); + struct aom_internal_error_info *error); void aom_buf_ans_free(struct BufAnsCoder *c); diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c index 0fc7b14a5..c6e3ac82d 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.c +++ b/third_party/aom/aom_dsp/daalaboolreader.c @@ -17,7 +17,7 @@ int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) { } r->buffer_end = buffer + size; r->buffer = buffer; - od_ec_dec_init(&r->ec, buffer, size - 1); + od_ec_dec_init(&r->ec, buffer, size); #if CONFIG_ACCOUNTING r->accounting = NULL; #endif diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h index 428d74db0..55ff8d3d5 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.h +++ b/third_party/aom/aom_dsp/daalaboolreader.h @@ -45,11 +45,7 @@ uint32_t aom_daala_reader_tell_frac(const daala_reader *r); static INLINE int aom_daala_read(daala_reader *r, int prob) { int bit; -#if CONFIG_EC_SMALLMUL int p = (0x7FFFFF - (prob << 15) + prob) >> 8; -#else - int p = ((prob << 15) + 256 - prob) >> 8; -#endif #if CONFIG_BITSTREAM_DEBUG /*{ const int queue_r = bitstream_queue_get_read(); @@ -113,6 +109,7 @@ static INLINE int aom_daala_reader_has_error(daala_reader *r) { static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf, int nsymbs) { int symb; + assert(cdf != NULL); symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); #if CONFIG_BITSTREAM_DEBUG diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c index 0ba8f6ab8..59af2a243 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.c +++ b/third_party/aom/aom_dsp/daalaboolwriter.c @@ -24,9 +24,5 @@ void aom_daala_stop_encode(daala_writer *br) { daala_data = od_ec_enc_done(&br->ec, &daala_bytes); memcpy(br->buffer, daala_data, daala_bytes); br->pos = daala_bytes; - /* Prevent ec bitstream from being detected as a superframe marker. - Must always be added, so that rawbits knows the exact length of the - bitstream. */ - br->buffer[br->pos++] = 0; od_ec_enc_clear(&br->ec); } diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h index bbaf53c69..6ec0f0b54 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.h +++ b/third_party/aom/aom_dsp/daalaboolwriter.h @@ -36,11 +36,7 @@ void aom_daala_start_encode(daala_writer *w, uint8_t *buffer); void aom_daala_stop_encode(daala_writer *w); static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) { -#if CONFIG_EC_SMALLMUL int p = (0x7FFFFF - (prob << 15) + prob) >> 8; -#else - int p = ((prob << 15) + 256 - prob) >> 8; -#endif #if CONFIG_BITSTREAM_DEBUG aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; /*int queue_r = 0; diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h index 534959e66..981a951e6 100644 --- a/third_party/aom/aom_dsp/entcode.h +++ b/third_party/aom/aom_dsp/entcode.h @@ -28,15 +28,11 @@ typedef uint32_t od_ec_window; 3 => 1/8th bits.*/ #define OD_BITRES (3) -/*With CONFIG_EC_SMALLMUL, the value stored in a CDF is 32768 minus the actual - Q15 cumulative probability (an "inverse" CDF). +/*The value stored in an iCDF is 32768 minus the actual Q15 cumulative + probability (an "inverse" CDF). 
This function converts from one representation to the other (and is its own inverse).*/ -#if CONFIG_EC_SMALLMUL #define OD_ICDF(x) (32768U - (x)) -#else -#define OD_ICDF(x) (x) -#endif /*See entcode.c for further documentation.*/ diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c index 49b176cd8..71dad0df6 100644 --- a/third_party/aom/aom_dsp/entdec.c +++ b/third_party/aom/aom_dsp/entdec.c @@ -114,12 +114,8 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, OD_ASSERT(rng <= 65535U); d = 16 - OD_ILOG_NZ(rng); dec->cnt -= d; -#if CONFIG_EC_SMALLMUL /*This is equivalent to shifting in 1's instead of 0's.*/ dec->dif = ((dif + 1) << d) - 1; -#else - dec->dif = dif << d; -#endif dec->rng = rng << d; if (dec->cnt < 0) od_ec_dec_refill(dec); return ret; @@ -137,11 +133,7 @@ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); dec->end = buf + storage; dec->bptr = buf; -#if CONFIG_EC_SMALLMUL dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; -#else - dec->dif = 0; -#endif dec->rng = 0x8000; dec->cnt = -15; dec->error = 0; @@ -149,8 +141,7 @@ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, } /*Decode a single binary value. - {EC_SMALLMUL} f: The probability that the bit is one, scaled by 32768. - {else} f: The probability that the bit is zero, scaled by 32768. + f: The probability that the bit is one, scaled by 32768. Return: The value decoded (0 or 1).*/ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { od_ec_window dif; @@ -165,7 +156,6 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { r = dec->rng; OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL v = (r >> 8) * (uint32_t)f >> 7; vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); ret = 1; @@ -175,30 +165,19 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { dif -= vw; ret = 0; } -#else - v = f * (uint32_t)r >> 15; - vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); - ret = 0; - r_new = v; - if (dif >= vw) { - r_new = r - v; - dif -= vw; - ret = 1; - } -#endif return od_ec_dec_normalize(dec, dif, r_new, ret); } -/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15. - cdf: The CDF, such that symbol s falls in the range - [s > 0 ? cdf[s - 1] : 0, cdf[s]). - The values must be monotonically non-increasing, and cdf[nsyms - 1] - must be 32768. - {EC_SMALLMUL}: The CDF contains 32768 minus those values. +/*Decodes a symbol given an inverse cumulative distribution function (CDF) + table in Q15. + icdf: 32768 minus the CDF, such that symbol s falls in the range + [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + The values must be monotonically non-increasing, and icdf[nsyms - 1] + must be 0. nsyms: The number of symbols in the alphabet. This should be at most 16. 
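[Editor's note] Concretely, the inverse-CDF convention that replaces the old CONFIG_EC_SMALLMUL toggle stores 32768 minus each Q15 cumulative probability, so a table's final entry is always OD_ICDF(32768) == 0, matching the OD_ASSERT in od_ec_decode_cdf_q15(). A small sketch of such a table (the probabilities here are chosen only for illustration):

#include <stdint.h>
#include "aom_dsp/entcode.h"

/* Four equiprobable symbols: the conventional Q15 CDF {8192, 16384, 24576,
   32768} is stored inverted as {24576, 16384, 8192, 0}. */
static const uint16_t uniform4_icdf[4] = { OD_ICDF(8192), OD_ICDF(16384),
                                           OD_ICDF(24576), OD_ICDF(32768) };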
Return: The decoded symbol s.*/ -int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) { +int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { od_ec_window dif; unsigned r; unsigned c; @@ -209,33 +188,19 @@ int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) { dif = dec->dif; r = dec->rng; OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); - OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U)); + OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U)); OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); v = r; ret = -1; do { u = v; - v = (r >> 8) * (uint32_t)cdf[++ret] >> 7; + v = (r >> 8) * (uint32_t)icdf[++ret] >> 7; } while (c < v); OD_ASSERT(v < u); OD_ASSERT(u <= r); r = u - v; dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); -#else - c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); - v = 0; - ret = -1; - do { - u = v; - v = cdf[++ret] * (uint32_t)r >> 15; - } while (v <= c); - OD_ASSERT(u < v); - OD_ASSERT(v <= r); - r = v - u; - dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16); -#endif return od_ec_dec_normalize(dec, dif, r, ret); } diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h index e1145e81d..35ac7fe0d 100644 --- a/third_party/aom/aom_dsp/entdec.h +++ b/third_party/aom/aom_dsp/entdec.h @@ -47,10 +47,8 @@ struct od_ec_dec { const unsigned char *end; /*The read pointer for the entropy-coded bits.*/ const unsigned char *bptr; - /*The difference between the coded value and the low end of the current - range. - {EC_SMALLMUL} The difference between the high end of the current range, - (low + rng), and the coded value, minus 1. + /*The difference between the high end of the current range, (low + rng), and + the coded value, minus 1. This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the decoder only uses the top 16 bits of the window to decode the next symbol. As we shift up during renormalization, if we don't have enough bits left in diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c index a350f27f4..b8c4dc047 100644 --- a/third_party/aom/aom_dsp/entenc.c +++ b/third_party/aom/aom_dsp/entenc.c @@ -143,11 +143,10 @@ void od_ec_enc_clear(od_ec_enc *enc) { } /*Encodes a symbol given its frequency in Q15. - fl: The cumulative frequency of all symbols that come before the one to be - encoded. - fh: The cumulative frequency of all symbols up to and including the one to - be encoded. - {EC_SMALLMUL} Both values are 32768 minus that.*/ + fl: 32768 minus the cumulative frequency of all symbols that come before the + one to be encoded. 
+ fh: 32768 minus the cumulative frequency of all symbols up to and including + the one to be encoded.*/ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { od_ec_window l; unsigned r; @@ -156,7 +155,6 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { l = enc->low; r = enc->rng; OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL OD_ASSERT(fh < fl); OD_ASSERT(fl <= 32768U); if (fl < 32768U) { @@ -167,14 +165,6 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { } else { r -= (r >> 8) * (uint32_t)fh >> 7; } -#else - OD_ASSERT(fl < fh); - OD_ASSERT(fh <= 32768U); - u = fl * (uint32_t)r >> 15; - v = fh * (uint32_t)r >> 15; - r = v - u; - l += u; -#endif od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.); @@ -184,8 +174,7 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { /*Encode a single binary value. val: The value to encode (0 or 1). - {EC_SMALLMUL} f: The probability that the val is one, scaled by 32768. - {else} f: The probability that val is zero, scaled by 32768.*/ + f: The probability that the val is one, scaled by 32768.*/ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { od_ec_window l; unsigned r; @@ -195,15 +184,9 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { l = enc->low; r = enc->rng; OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL v = (r >> 8) * (uint32_t)f >> 7; if (val) l += r - v; r = val ? v : r - v; -#else - v = f * (uint32_t)r >> 15; - if (val) l += v; - r = val ? r - v : v; -#endif od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD enc->entropy -= @@ -214,19 +197,19 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { /*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. s: The index of the symbol to encode. - cdf: The CDF, such that symbol s falls in the range - [s > 0 ? cdf[s - 1] : 0, cdf[s]). - The values must be monotonically non-decreasing, and the last value - must be exactly 32768. + icdf: 32768 minus the CDF, such that symbol s falls in the range + [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + The values must be monotonically decreasing, and icdf[nsyms - 1] must + be 0. nsyms: The number of symbols in the alphabet. This should be at most 16.*/ -void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, int nsyms) { (void)nsyms; OD_ASSERT(s >= 0); OD_ASSERT(s < nsyms); - OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U)); - od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : OD_ICDF(0), cdf[s]); + OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U)); + od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s]); } #if CONFIG_RAWBITS diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c index b4d47ae89..6d2ac37d9 100644 --- a/third_party/aom/aom_dsp/intrapred.c +++ b/third_party/aom/aom_dsp/intrapred.c @@ -16,6 +16,7 @@ #include "./aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/intrapred_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" @@ -179,7 +180,6 @@ static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } -#if CONFIG_ALT_INTRA static INLINE int abs_diff(int a, int b) { return (a > b) ? 
a - b : b - a; } static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, @@ -208,40 +208,6 @@ static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -// Weights are quadratic from '1' to '1 / block_size', scaled by -// 2^sm_weight_log2_scale. -static const int sm_weight_log2_scale = 8; - -#if CONFIG_TX64X64 -// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) -#define MAX_BLOCK_DIM 64 -#else -#define MAX_BLOCK_DIM 32 -#endif // CONFIG_TX64X64 - -static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { - // Unused, because we always offset by bs, which is at least 2. - 0, 0, - // bs = 2 - 255, 128, - // bs = 4 - 255, 149, 85, 64, - // bs = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // bs = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // bs = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, -#if CONFIG_TX64X64 - // bs = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, - 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, - 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, -#endif // CONFIG_TX64X64 -}; - // Some basic checks on weights for smooth predictor. #define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \ pred_scale) \ @@ -344,21 +310,6 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } #endif // CONFIG_SMOOTH_HV -#else - -static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left) { - int r, c; - int ytop_left = above[-1]; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - dst[c] = clip_pixel(left[r] + above[c] - ytop_left); - dst += stride; - } -} -#endif // CONFIG_ALT_INTRA - static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { @@ -794,7 +745,6 @@ void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, DST(1, 1) = AVG3(J, I, X); } -#if CONFIG_ALT_INTRA static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -901,23 +851,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; } } -#endif - -#else -static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - int ytop_left = above[-1]; - (void)bd; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); - dst += stride; - } -} -#endif // CONFIG_ALT_INTRA +#endif // CONFIG_SMOOTH_HV static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, @@ -1017,12 +951,16 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, intra_pred_sized(type, 16, 8) \ intra_pred_sized(type, 16, 32) \ intra_pred_sized(type, 32, 16) \ + intra_pred_sized(type, 32, 64) \ + intra_pred_sized(type, 64, 32) \ intra_pred_highbd_sized(type, 4, 8) \ intra_pred_highbd_sized(type, 8, 4) \ intra_pred_highbd_sized(type, 8, 16) \ intra_pred_highbd_sized(type, 16, 8) \ intra_pred_highbd_sized(type, 16, 32) \ - intra_pred_highbd_sized(type, 32, 16) + intra_pred_highbd_sized(type, 32, 16) \ + 
intra_pred_highbd_sized(type, 32, 64) \ + intra_pred_highbd_sized(type, 64, 32) #define intra_pred_above_4x4(type) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ @@ -1078,7 +1016,9 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, intra_pred_sized(type, 8, 16) \ intra_pred_sized(type, 16, 8) \ intra_pred_sized(type, 16, 32) \ - intra_pred_sized(type, 32, 16) + intra_pred_sized(type, 32, 16) \ + intra_pred_sized(type, 32, 64) \ + intra_pred_sized(type, 64, 32) #define intra_pred_above_4x4(type) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ @@ -1118,16 +1058,12 @@ intra_pred_above_4x4(d135) intra_pred_above_4x4(d153) intra_pred_allsizes(v) intra_pred_allsizes(h) -#if CONFIG_ALT_INTRA intra_pred_allsizes(smooth) #if CONFIG_SMOOTH_HV intra_pred_allsizes(smooth_v) intra_pred_allsizes(smooth_h) #endif // CONFIG_SMOOTH_HV intra_pred_allsizes(paeth) -#else -intra_pred_allsizes(tm) -#endif // CONFIG_ALT_INTRA intra_pred_allsizes(dc_128) intra_pred_allsizes(dc_left) intra_pred_allsizes(dc_top) diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h new file mode 100644 index 000000000..96da49b03 --- /dev/null +++ b/third_party/aom/aom_dsp/intrapred_common.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_INTRAPRED_COMMON_H +#define _AOM_DSP_INTRAPRED_COMMON_H + +#include "./aom_config.h" + +// Weights are quadratic from '1' to '1 / block_size', scaled by +// 2^sm_weight_log2_scale. +static const int sm_weight_log2_scale = 8; + +#if CONFIG_TX64X64 +// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) +#define MAX_BLOCK_DIM 64 +#else +#define MAX_BLOCK_DIM 32 +#endif // CONFIG_TX64X64 + +static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { + // Unused, because we always offset by bs, which is at least 2. 
+ 0, 0, + // bs = 2 + 255, 128, + // bs = 4 + 255, 149, 85, 64, + // bs = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // bs = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // bs = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, +#if CONFIG_TX64X64 + // bs = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, + 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, +#endif // CONFIG_TX64X64 +}; + +#endif // _AOM_DSP_INTRAPRED_COMMON_H diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c index 398eb0a12..6b7c1c2ab 100644 --- a/third_party/aom/aom_dsp/inv_txfm.c +++ b/third_party/aom/aom_dsp/inv_txfm.c @@ -14,7 +14,8 @@ #include "./aom_dsp_rtcd.h" #include "aom_dsp/inv_txfm.h" -#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 +#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ + CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 #include "av1/common/daala_tx.h" #endif @@ -96,18 +97,6 @@ void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { } } -#if CONFIG_DAALA_DCT4 -void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[4]; - od_coeff y[4]; - for (i = 0; i < 4; i++) y[i] = input[i]; - od_bin_idct4(x, 1, y); - for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i]; -} - -#else - void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step[4]; tran_high_t temp1, temp2; @@ -127,7 +116,6 @@ void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { output[2] = WRAPLOW(step[1] - step[2]); output[3] = WRAPLOW(step[0] - step[3]); } -#endif void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[4 * 4]; @@ -172,18 +160,6 @@ void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, } } -#if CONFIG_DAALA_DCT8 -void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; - od_bin_idct8(x, 1, y); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; -} - -#else - void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; @@ -237,7 +213,6 @@ void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { output[6] = WRAPLOW(step1[1] - step1[6]); output[7] = WRAPLOW(step1[0] - step1[7]); } -#endif void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[8 * 8]; @@ -313,18 +288,6 @@ void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) { output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); } -#if CONFIG_DAALA_DCT8 -void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; - od_bin_idst8(x, 1, y); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; -} - -#else - void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; @@ -402,8 +365,6 @@ void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { output[7] = WRAPLOW(-x1); } -#endif - void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; @@ -1224,7 +1185,7 @@ 
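[Editor's note] The sm_weight_arrays table that now lives in intrapred_common.h (added just above) is indexed by offsetting by the block dimension, per its leading comment ("we always offset by bs"). A small sketch of the lookup the smooth predictors perform; the blend formula itself stays in intrapred.c and is not restated here:

#include "aom_dsp/intrapred_common.h"

/* Weights for a dimension of bs start at index bs and run for bs entries,
   e.g. bs == 4 -> { 255, 149, 85, 64 }, all scaled by
   2^sm_weight_log2_scale (= 256). */
static const uint8_t *sm_weights_for_size(int bs) {
  return sm_weight_arrays + bs;
}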
void aom_idct32_c(const tran_low_t *input, tran_low_t *output) { #if CONFIG_MRC_TX void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride, int *mask) { + int stride, uint8_t *mask) { tran_low_t out[32 * 32]; tran_low_t *outptr = out; int i, j; @@ -1265,7 +1226,7 @@ void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, } void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask) { + uint8_t *mask) { tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; @@ -1295,7 +1256,7 @@ void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, } void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask) { + uint8_t *mask) { tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h index a9c485e74..644a6599f 100644 --- a/third_party/aom/aom_dsp/inv_txfm.h +++ b/third_party/aom/aom_dsp/inv_txfm.h @@ -55,19 +55,22 @@ static INLINE tran_high_t check_range(tran_high_t input, int bd) { #if CONFIG_MRC_TX // These each perform dct but add coefficients based on a mask void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride, int *mask); + int stride, uint8_t *mask); void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask); + uint8_t *mask); void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask); + uint8_t *mask); #endif // CONFIG_MRC_TX void aom_idct4_c(const tran_low_t *input, tran_low_t *output); void aom_idct8_c(const tran_low_t *input, tran_low_t *output); void aom_idct16_c(const tran_low_t *input, tran_low_t *output); void aom_idct32_c(const tran_low_t *input, tran_low_t *output); +#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64 +void aom_idct64_c(const tran_low_t *input, tran_low_t *output); +#endif void aom_iadst4_c(const tran_low_t *input, tran_low_t *output); void aom_iadst8_c(const tran_low_t *input, tran_low_t *output); void aom_iadst16_c(const tran_low_t *input, tran_low_t *output); diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c index 7ea1e6b89..69f131378 100644 --- a/third_party/aom/aom_dsp/loopfilter.c +++ b/third_party/aom/aom_dsp/loopfilter.c @@ -23,6 +23,14 @@ static INLINE int8_t signed_char_clamp(int t) { #define PARALLEL_DEBLOCKING_11_TAP 0 #define PARALLEL_DEBLOCKING_9_TAP 0 +#if CONFIG_DEBLOCK_13TAP +#define PARALLEL_DEBLOCKING_13_TAP 1 +#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1 +#else +#define PARALLEL_DEBLOCKING_13_TAP 0 +#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0 +#endif + #if CONFIG_HIGHBITDEPTH static INLINE int16_t signed_char_clamp_high(int t, int bd) { switch (bd) { @@ -58,6 +66,19 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, return ~mask; } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, + uint8_t p0, uint8_t q0, uint8_t q1, + uint8_t q2) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + return ~mask; +} +#endif + static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { @@ -216,6 +237,25 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, 
aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { + if (flat && mask) { + const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} +#endif + static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, @@ -236,6 +276,32 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, } } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p); + ++s; + } +} +#endif + void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; @@ -268,6 +334,28 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); + s += pitch; + } +} +#endif + void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; @@ -297,6 +385,56 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } +#if PARALLEL_DEBLOCKING_13_TAP +static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op6, uint8_t *op5, + uint8_t *op4, uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, uint8_t *oq0, + uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) { + if (flat2 && flat && mask) { + const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, + p1 = *op1, p0 = *op0; + 
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} +#endif + #if PARALLEL_DEBLOCKING_11_TAP static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint8_t *op5, uint8_t *op4, @@ -428,7 +566,16 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); -#if PARALLEL_DEBLOCKING_11_TAP +#if PARALLEL_DEBLOCKING_13_TAP + (void)p7; + (void)q7; + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); + +#elif PARALLEL_DEBLOCKING_11_TAP const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p, @@ -482,7 +629,14 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); -#if PARALLEL_DEBLOCKING_11_TAP +#if PARALLEL_DEBLOCKING_13_TAP + (void)p7; + (void)q7; + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, + s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); +#elif PARALLEL_DEBLOCKING_11_TAP const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2, @@ -553,6 +707,21 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, return ~mask; } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, + uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, + uint16_t q2, int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + 
mask |= (abs(q2 - q0) > thresh16) * -1; + return ~mask; +} +#endif + static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, @@ -708,6 +877,26 @@ void aom_highbd_lpf_vertical_4_dual_c( bd); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + int bd) { + if (flat && mask) { + const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} +#endif + static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, @@ -754,6 +943,33 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, } } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, bd); + ++s; + } +} +#endif + void aom_highbd_lpf_horizontal_8_dual_c( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, @@ -762,6 +978,30 @@ void aom_highbd_lpf_horizontal_8_dual_c( aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, + bd); + s += pitch; + } +} +#endif + void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { @@ -794,6 +1034,68 @@ void aom_highbd_lpf_vertical_8_dual_c( bd); } +#if PARALLEL_DEBLOCKING_13_TAP +static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, 
uint16_t *op6, uint16_t *op5, + uint16_t *op4, uint16_t *op3, uint16_t *op2, + uint16_t *op1, uint16_t *op0, uint16_t *oq0, + uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, + uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, + int bd) { + if (flat2 && flat && mask) { + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} +#endif + static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, @@ -887,6 +1189,16 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + +#if PARALLEL_DEBLOCKING_13_TAP + const int8_t flat2 = + highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], + s[5 * p], s[6 * p], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); +#else const int8_t flat2 = highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); @@ -895,6 +1207,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p, bd); +#endif ++s; } } @@ -937,12 +1250,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); +#if PARALLEL_DEBLOCKING_13_TAP + const int8_t flat2 = + highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], 
s[6], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, + s + 6, bd); +#else const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7], bd); highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, bd); +#endif s += p; } } diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c index 298065adb..3574da19f 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c @@ -407,6 +407,11 @@ void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint32_t tp1, tp2, tn1; uint32_t tp3, tp4, tn2; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + /* prefetch data to cache memory */ prefetch_load(src); prefetch_load(src + 32); diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c index c871702f4..dd4bc821a 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c @@ -1304,6 +1304,8 @@ void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; + (void)x_step_q4; + assert(x_step_q4 == 16); assert(y_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); @@ -1400,6 +1402,11 @@ void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int x, y; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + /* prefetch data to cache memory */ prefetch_load(src); prefetch_load(src + 32); diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c index dc8f20208..7c221ae89 100644 --- a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c +++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c @@ -17,6 +17,8 @@ void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + (void)above; + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c index ea7c02810..0a21979c7 100644 --- a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c +++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c @@ -15,6 +15,7 @@ void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" @@ -78,148 +79,4 @@ void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } - -void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t left0, left1, left2, left3; - int32_t res0, res1; - int32_t resl; - int32_t resr; - int32_t top_left; - uint8_t *cm = aom_ff_cropTbl; - - __asm__ __volatile__( - "ulw %[resl], (%[above]) \n\t" - - "lbu %[left0], (%[left]) \n\t" - "lbu %[left1], 1(%[left]) \n\t" - "lbu %[left2], 2(%[left]) \n\t" - "lbu %[left3], 3(%[left]) \n\t" - - "lbu 
%[top_left], -1(%[above]) \n\t" - - "preceu.ph.qbl %[abovel], %[resl] \n\t" - "preceu.ph.qbr %[abover], %[resl] \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "replv.ph %[left1], %[left1] \n\t" - "replv.ph %[left2], %[left2] \n\t" - "replv.ph %[left3], %[left3] \n\t" - - "replv.ph %[top_left], %[top_left] \n\t" - - "addu.ph %[resl], %[abovel], %[left0] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left0] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left1] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left1] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sb %[res1], 1(%[dst]) \n\t" - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left2] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left2] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sb %[res1], 1(%[dst]) \n\t" - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left3] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left3] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0), - [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0), - [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl), - [resr] "=&r"(resr), [top_left] "=&r"(top_left) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride), [cm] "r"(cm)); -} #endif // #if HAVE_DSPR2 diff --git 
a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c index 1114fbc00..d42a77c80 100644 --- a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c +++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c @@ -15,6 +15,7 @@ void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" @@ -146,458 +147,4 @@ void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } - -void aom_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t abovel_1, abover_1; - int32_t left0; - int32_t res0, res1, res2, res3; - int32_t reshw; - int32_t top_left; - uint8_t *cm = aom_ff_cropTbl; - - __asm__ __volatile__( - "ulw %[reshw], (%[above]) \n\t" - "ulw %[top_left], 4(%[above]) \n\t" - - "lbu %[left0], (%[left]) \n\t" - - "preceu.ph.qbl %[abovel], %[reshw] \n\t" - "preceu.ph.qbr %[abover], %[reshw] \n\t" - "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" - "preceu.ph.qbr %[abover_1], %[top_left] \n\t" - - "lbu %[top_left], -1(%[above]) \n\t" - "replv.ph %[left0], %[left0] \n\t" - - "replv.ph %[top_left], %[top_left] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 1(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 
3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 2(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 3(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 4(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], 
%[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 5(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 6(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - 
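For reference, the earlier dspr2 convolve/intrapred hunks in this patch add (void) casts for parameters the optimized routines never read (filter_x, filter_x_stride, above, and so on); the prototype is fixed by the shared dispatch table, so the cast is the usual way to keep -Wunused-parameter quiet. A minimal sketch with hypothetical names, not code from the library:

#include <stdint.h>
#include <string.h>

/* Hypothetical specialization: the signature is dictated by a shared
 * function table, but this variant never reads the filter argument. */
static void copy_block(const uint8_t *src, int src_stride, uint8_t *dst,
                       int dst_stride, const int16_t *filter_x, int w, int h) {
  (void)filter_x; /* unused here; silences -Wunused-parameter */
  for (int r = 0; r < h; ++r)
    memcpy(dst + r * dst_stride, src + r * src_stride, w);
}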
"sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 7(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - : [abovel] "=&r"(abovel), [abover] "=&r"(abover), - [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1), - [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3), - [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw), - [top_left] "=&r"(top_left) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride), [cm] "r"(cm)); -} #endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c index e8eaec7a9..bcb9c9df9 100644 --- a/third_party/aom/aom_dsp/mips/intrapred_msa.c +++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c @@ -382,176 +382,6 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { } } -static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint32_t val; - uint8_t top_left = src_top_ptr[-1]; - v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, 
src_top = { 0 }; - v16u8 src0, src1, src2, src3; - v8u16 src_top_left, vec0, vec1, vec2, vec3; - - src_top_left = (v8u16)__msa_fill_h(top_left); - val = LW(src_top_ptr); - src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); - - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - - ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, - src_left3, src_top, src0, src1, src2, src3); - HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); - SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); -} - -static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint64_t val; - uint8_t top_left = src_top_ptr[-1]; - uint32_t loop_cnt; - v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; - v8u16 src_top_left, vec0, vec1, vec2, vec3; - v16u8 src0, src1, src2, src3; - - val = LD(src_top_ptr); - src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 2; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, - src_left3, src_top, src0, src1, src2, src3); - HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); - SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint8_t top_left = src_top_ptr[-1]; - uint32_t loop_cnt; - v16i8 src_top, src_left0, src_left1, src_left2, src_left3; - v8u16 src_top_left, res_r, res_l; - - src_top = LD_SB(src_top_ptr); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 4; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVRL_B2_UH(src_left0, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left1, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left2, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left3, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); 
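The dspr2 and MSA hunks being deleted in this area implemented the TM (true-motion) intra predictor, whose aom_tm_predictor_* entry points this patch removes. For reference, the prediction those SIMD kernels computed is, in scalar form, roughly the following sketch (local helper names are mine, and the real code clips through a lookup table rather than branches):

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* Scalar TM prediction: pred[r][c] = clip(left[r] + above[c] - above[-1]);
 * bs is the block size (4, 8, 16 or 32 for the kernels removed here). */
static void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                         const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  for (int r = 0; r < bs; ++r) {
    for (int c = 0; c < bs; ++c)
      dst[c] = clip_pixel(left[r] + above[c] - top_left);
    dst += stride;
  }
}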
- PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - } -} - -static void intra_predict_tm_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint8_t top_left = src_top[-1]; - uint32_t loop_cnt; - v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; - v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; - - LD_SB2(src_top, 16, src_top0, src_top1); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 8; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - } -} - void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left) { (void)left; @@ -717,23 +547,3 @@ void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, intra_predict_128dc_32x32_msa(dst, y_stride); } - -void aom_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_4x4_msa(above, left, dst, y_stride); -} - -void aom_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_8x8_msa(above, left, dst, y_stride); -} - -void aom_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_16x16_msa(above, left, dst, y_stride); -} - -void aom_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, 
- const uint8_t *above, const uint8_t *left) { - intra_predict_tm_32x32_msa(above, left, dst, y_stride); -} diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h index 8a85e26f3..c69835173 100644 --- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h +++ b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h @@ -24,10 +24,12 @@ extern "C" { #endif #if HAVE_DSPR2 +/* Note: this macro expects a local int32_t named out to exist, and will write + * to that variable. */ #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ ({ \ \ - int32_t tmp, out; \ + int32_t tmp; \ int dct_cost_rounding = DCT_CONST_ROUNDING; \ int in = input; \ \ diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h index 35db134e5..a517e810a 100644 --- a/third_party/aom/aom_dsp/prob.h +++ b/third_party/aom/aom_dsp/prob.h @@ -46,6 +46,14 @@ typedef uint16_t aom_cdf_prob; #define MAX_PROB 255 +#define LV_MAP_PROB 1 + +#define BR_NODE 1 + +#if CONFIG_ADAPT_SCAN +#define CACHE_SCAN_PROB 1 +#endif + #define aom_prob_half ((aom_prob)128) typedef int8_t aom_tree_index; @@ -149,7 +157,11 @@ static INLINE void av1_tree_to_cdf(const aom_tree_index *tree, void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree); static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { - const int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); + int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); +#if CONFIG_LV_MAP + if (nsymbs == 2) + rate = 4 + (cdf[nsymbs] > 7) + (cdf[nsymbs] > 15) + get_msb(nsymbs); +#endif const int rate2 = 5; int i, tmp; int diff; @@ -158,7 +170,7 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { tmp = AOM_ICDF(tmp0); diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate; // Single loop (faster) -#if !CONFIG_ANS && CONFIG_EC_SMALLMUL +#if !CONFIG_ANS for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) { tmp -= (i == val ? 
diff : 0); cdf[i] += ((tmp - cdf[i]) >> rate); @@ -183,6 +195,12 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { cdf[nsymbs] += (cdf[nsymbs] < 32); } +#if CONFIG_LV_MAP +static INLINE void update_bin(aom_cdf_prob *cdf, int val, int nsymbs) { + update_cdf(cdf, val, nsymbs); +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c index 461c13729..d543f12d1 100644 --- a/third_party/aom/aom_dsp/psnr.c +++ b/third_party/aom/aom_dsp/psnr.c @@ -289,6 +289,27 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, } #endif // CONFIG_HIGHBITDEPTH +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd) { +#if CONFIG_HIGHBITDEPTH + if (highbd) { + switch (plane) { + case 0: return aom_highbd_get_y_sse(a, b); + case 1: return aom_highbd_get_u_sse(a, b); + case 2: return aom_highbd_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } +#endif + (void)highbd; + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } +} + #if CONFIG_HIGHBITDEPTH void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, @@ -296,9 +317,7 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; - const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; - const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; int i; uint64_t total_sse = 0; @@ -313,14 +332,15 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, uint64_t sse; if (a->flags & YV12_FLAG_HIGHBITDEPTH) { if (input_shift) { - sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i], + sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h, input_shift); } else { - sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i], + sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); } } else { - sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, + h); } psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; @@ -344,9 +364,7 @@ void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; - const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; - const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; int i; uint64_t total_sse = 0; @@ -357,7 +375,7 @@ void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int h = heights[i]; const uint32_t samples = w * h; const uint64_t sse = - get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); psnr->sse[1 + 
i] = sse; psnr->samples[1 + i] = samples; psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h index 480140e6f..df5f8f9f2 100644 --- a/third_party/aom/aom_dsp/psnr.h +++ b/third_party/aom/aom_dsp/psnr.h @@ -47,6 +47,8 @@ int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd); #if CONFIG_HIGHBITDEPTH int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c index fe98b6028..21bcc486a 100644 --- a/third_party/aom/aom_dsp/quantize.c +++ b/third_party/aom/aom_dsp/quantize.c @@ -12,18 +12,14 @@ #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" -static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, -#if CONFIG_AOM_QM - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, -#endif - const int log_scale) { +void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; @@ -37,20 +33,12 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Pre-scan pass for (i = (int)n_coeffs - 1; i >= 0; i--) { const int rc = scan[i]; -#if CONFIG_AOM_QM - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; -#else - const int coeff = coeff_ptr[rc]; -#endif // CONFIG_AOM_QM -#if CONFIG_AOM_QM - if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && - coeff > (nzbins[rc != 0] << AOM_QM_BITS)) + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) non_zero_count--; -#else - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) non_zero_count--; -#endif // CONFIG_AOM_QM else break; } @@ -64,35 +52,21 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp32; -#if CONFIG_AOM_QM - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { -#else - if (abs_coeff >= zbins[rc != 0]) { -#endif // CONFIG_AOM_QM int64_t tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), INT16_MIN, INT16_MAX); -#if CONFIG_AOM_QM tmp *= wt; tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * quant_shift_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); // quantization -#else - tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - (16 - log_scale)); // quantization -#endif // CONFIG_AOM_QM qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; -#if CONFIG_AOM_QM + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); const int dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); -#else - dqcoeff_ptr[rc] = - qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale); -#endif // CONFIG_AOM_QM if (tmp32) eob = i; } @@ -101,324 +75,25 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } -void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan -#if CONFIG_AOM_QM - , - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr -#endif - ) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, -#if CONFIG_AOM_QM - qm_ptr, iqm_ptr, -#endif - 0); -} - -void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan -#if CONFIG_AOM_QM - , - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr -#endif - ) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, -#if CONFIG_AOM_QM - qm_ptr, iqm_ptr, -#endif - 1); -} - -#if CONFIG_TX64X64 -void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan -#if CONFIG_AOM_QM - , - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr -#endif - ) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, -#if CONFIG_AOM_QM - qm_ptr, iqm_ptr, -#endif - 2); -} -#endif // CONFIG_TX64X64 - -#if CONFIG_AOM_QM -void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - 
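Stepping back to the update_cdf() change in prob.h above: for binary symbols (nsymbs == 2, under CONFIG_LV_MAP) the adaptation rate now also increases when the per-context counter cdf[nsymbs] passes 7 and 15, not only at 31, so the step size decays in two extra stages. The underlying update moves each CDF entry a fraction 1/2^rate of the way toward the ideal CDF for the coded symbol. A simplified, self-contained sketch of that idea only (the library works on inverse CDFs in fixed point and saturates the counter, which is omitted here):

#include <stdint.h>

/* Toy adaptation step: each CDF entry moves 1/2^rate of the way toward the
 * "ideal" CDF that puts all probability mass on the coded symbol val.
 * Relies on arithmetic right shift of negative ints, as the library does. */
static void toy_update_cdf(uint16_t *cdf, int nsymbs, int val, int rate) {
  const int cdf_top = 1 << 15; /* stands in for the probability total */
  for (int i = 0; i < nsymbs - 1; ++i) {
    const int target = (i >= val) ? cdf_top : 0; /* point mass at val */
    cdf[i] = (uint16_t)(cdf[i] + ((target - cdf[i]) >> rate));
  }
}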
const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int64_t tmp, eob = -1; - int32_t tmp32; - int dequant = - (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; - if (tmp32) eob = 0; - } - *eob_ptr = eob + 1; -} - -void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int64_t tmp, eob = -1; - int32_t tmp32; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dequant = - (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; - if (tmp32) eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int64_t tmp, eob = -1; - int32_t tmp32; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), - INT16_MIN, INT16_MAX); - tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dequant = - (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; - if (tmp32) eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - int eob = -1; - int dequant = - (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - 
coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[0]; - const uint32_t abs_qcoeff = - (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS)); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - int eob = -1; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); - const uint32_t abs_qcoeff = - (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS)); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - int eob = -1; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); - const uint32_t abs_qcoeff = - (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS)); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - int dequant; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr[rc]; - const int coeff = coeff_ptr[rc] * wt; - - if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && - coeff > (nzbins[rc != 0] << AOM_QM_BITS)) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with 
index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const qm_val_t wt = qm_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmpw = tmp1 * wt; - const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; - if (abs_qcoeff) eob = i; - } - } - } - *eob_ptr = eob + 1; -} - -void aom_highbd_quantize_b_32x32_c( +void highbd_quantize_b_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[1024]; + const qm_val_t *iqm_ptr, const int log_scale) { int i, eob = -1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int dequant; +#if CONFIG_TX64X64 + int idx_arr[4096]; +#else + int idx_arr[1024]; +#endif (void)iscan; + int idx = 0; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -427,13 +102,13 @@ void aom_highbd_quantize_b_32x32_c( // Pre-scan pass for (i = 0; i < n_coeffs; i++) { const int rc = scan[i]; - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; // If the coefficient is out of the base ZBIN range, keep it for // quantization. - if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || - coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) idx_arr[idx++] = i; } @@ -443,134 +118,112 @@ void aom_highbd_quantize_b_32x32_c( const int rc = scan[idx_arr[i]]; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); const int64_t tmpw = tmp1 * wt; const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS)); + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; + dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); if (abs_qcoeff) eob = idx_arr[i]; } } *eob_ptr = eob + 1; } -#if CONFIG_TX64X64 -void aom_highbd_quantize_b_64x64_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), - ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[4096]; - int i, eob = -1; +void quantize_dc_helper(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp; + int eob = -1; + int32_t tmp32; int dequant; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr[rc]; - const int coeff = coeff_ptr[rc] * wt; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || - coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. 
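In the rewritten quantize helpers above, the CONFIG_AOM_QM #ifdefs give way to run-time NULL checks: passing qm_ptr == NULL / iqm_ptr == NULL selects an implicit flat matrix of weight 1 << AOM_QM_BITS, so the weighted expressions collapse to the unweighted ones. A small sketch of that identity for the dequant step; QM_BITS is a local stand-in (the identity holds for any width), and effective_dequant is a hypothetical name:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define QM_BITS 5 /* local stand-in for AOM_QM_BITS */

/* Weighted dequant as in the helper; a NULL matrix means "flat weight". */
static int effective_dequant(int dequant, const uint8_t *iqm, int rc) {
  const int iwt = (iqm != NULL) ? iqm[rc] : (1 << QM_BITS);
  return (dequant * iwt + (1 << (QM_BITS - 1))) >> QM_BITS;
}

int main(void) {
  assert(effective_dequant(52, NULL, 0) == 52); /* flat weight is a no-op */
  return 0;
}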
- for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const qm_val_t wt = qm_ptr[rc]; - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); - const int64_t tmpw = tmp1 * wt; - const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; - if (abs_qcoeff) eob = idx_arr[i]; - } + const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); + if (tmp32) eob = 0; } *eob_ptr = eob + 1; } -#endif // CONFIG_TX64X64 -#else // CONFIG_AOM_QM +/* These functions should only be called when quantisation matrices + are not used. */ +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); +} + +#if CONFIG_TX64X64 +void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); +} +#endif // CONFIG_TX64X64 void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - 
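The per-size entry points that follow reduce to one-line wrappers around the shared helpers, differing only in log_scale (0 for ordinary sizes, 1 for 32x32, 2 for 64x64, as the wrapper bodies show): the rounding offset shrinks by 2^log_scale while the quantizer shift and final dequant division widen by the same amount. A compressed illustration of just that scaling for the flat-weight DC case; the macro and function names here are local stand-ins, and the real helper also clamps to the int16 range:

#include <stdint.h>

/* Rounding add of half the divisor (safe for n == 0). */
#define RND_POW2(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

/* How log_scale enters the DC path (flat-weight reading of quantize_dc_helper):
 * larger transforms use log_scale 1 or 2, which halves/quarters the rounding
 * offset and widens the quantizer shift correspondingly. */
static int32_t toy_quantize_dc(int32_t abs_coeff, int16_t round, int16_t quant,
                               int log_scale) {
  const int64_t tmp = abs_coeff + RND_POW2(round, log_scale);
  return (int32_t)((tmp * quant) >> (16 - log_scale));
}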
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; - if (tmp) eob = 0; - } - *eob_ptr = eob + 1; + quantize_dc_helper(coeff_ptr, n_coeffs, skip_block, round_ptr, quant, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, + 0); } void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int n_coeffs = 1024; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; - if (tmp) eob = 0; - } - *eob_ptr = eob + 1; + quantize_dc_helper(coeff_ptr, 1024, skip_block, round_ptr, quant, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 1); } #if CONFIG_TX64X64 @@ -578,100 +231,8 @@ void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int n_coeffs = 4096; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 14; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4; - if (tmp) eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr) { - int eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[0]; - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { - const int n_coeffs = 1024; - int eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * 
sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { - const int n_coeffs = 4096; - int eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; + quantize_dc_helper(coeff_ptr, 4096, skip_block, round_ptr, quant, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 2); } #endif // CONFIG_TX64X64 @@ -682,45 +243,10 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
- for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } - } - } - *eob_ptr = eob + 1; + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 0); } void aom_highbd_quantize_b_32x32_c( @@ -729,47 +255,10 @@ void aom_highbd_quantize_b_32x32_c( const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = idx_arr[i]; - } - } - *eob_ptr = eob + 1; + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); } #if CONFIG_TX64X64 @@ -779,47 +268,9 @@ void aom_highbd_quantize_b_64x64_c( const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), - ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[4096]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. 
- if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; - if (abs_qcoeff) eob = idx_arr[i]; - } - } - *eob_ptr = eob + 1; + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 2); } #endif // CONFIG_TX64X64 -#endif // CONFIG_AOM_QM diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h index fe49b830f..03609e8b4 100644 --- a/third_party/aom/aom_dsp/quantize.h +++ b/third_party/aom/aom_dsp/quantize.h @@ -19,32 +19,57 @@ extern "C" { #endif -#if CONFIG_AOM_QM +void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan); + +void highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif + void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); + const int16_t dequant_ptr, uint16_t *eob_ptr); void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); + const int16_t dequant_ptr, 
uint16_t *eob_ptr); #if CONFIG_TX64X64 void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); + const int16_t dequant_ptr, uint16_t *eob_ptr); #endif // CONFIG_TX64X64 -void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr); + +#if CONFIG_AOM_QM #if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, @@ -64,32 +89,10 @@ void aom_highbd_quantize_dc_64x64( const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); #endif // CONFIG_TX64X64 -void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); #endif // CONFIG_HIGHBITDEPTH #else // CONFIG_AOM_QM -void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#if CONFIG_TX64X64 -void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#endif // CONFIG_TX64X64 #if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c index b9c789ce5..6b8ca669b 100644 --- a/third_party/aom/aom_dsp/sad.c +++ b/third_party/aom/aom_dsp/sad.c @@ -163,11 +163,19 @@ sadMxN(8, 32) sadMxNx4D(8, 32) sadMxN(32, 8) sadMxNx4D(32, 8) +sadMxN(16, 64) +sadMxNx4D(16, 64) +sadMxN(64, 16) +sadMxNx4D(64, 16) +sadMxN(32, 128) +sadMxNx4D(32, 128) +sadMxN(128, 32) +sadMxNx4D(128, 32) #endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; @@ -328,12 +336,20 @@ highbd_sadMxN(8, 32) highbd_sadMxNx4D(8, 32) highbd_sadMxN(32, 8) highbd_sadMxNx4D(32, 8) +highbd_sadMxN(16, 64) +highbd_sadMxNx4D(16, 64) +highbd_sadMxN(64, 16) +highbd_sadMxNx4D(64, 16) +highbd_sadMxN(32, 128) +highbd_sadMxNx4D(32, 128) +highbd_sadMxN(128, 32) +highbd_sadMxNx4D(128, 32) #endif /* clang-format on */ #endif // CONFIG_HIGHBITDEPTH -#if CONFIG_AV1 && CONFIG_EXT_INTER - static INLINE +#if CONFIG_AV1 + static INLINE unsigned int masked_sad(const 
uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, @@ -395,11 +411,15 @@ MASKSADMxN(4, 16) MASKSADMxN(16, 4) MASKSADMxN(8, 32) MASKSADMxN(32, 8) +MASKSADMxN(16, 64) +MASKSADMxN(64, 16) +MASKSADMxN(32, 128) +MASKSADMxN(128, 32) #endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, @@ -464,9 +484,13 @@ HIGHBD_MASKSADMXN(4, 16) HIGHBD_MASKSADMXN(16, 4) HIGHBD_MASKSADMXN(8, 32) HIGHBD_MASKSADMXN(32, 8) +HIGHBD_MASKSADMXN(16, 64) +HIGHBD_MASKSADMXN(64, 16) +HIGHBD_MASKSADMXN(32, 128) +HIGHBD_MASKSADMXN(128, 32) #endif #endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 && CONFIG_EXT_INTER +#endif // CONFIG_AV1 #if CONFIG_AV1 && CONFIG_MOTION_VAR // pre: predictor being evaluated @@ -522,11 +546,15 @@ OBMCSADMxN(4, 16) OBMCSADMxN(16, 4) OBMCSADMxN(8, 32) OBMCSADMxN(32, 8) +OBMCSADMxN(16, 64) +OBMCSADMxN(64, 16) +OBMCSADMxN(32, 128) +OBMCSADMxN(128, 32) #endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { @@ -578,6 +606,10 @@ HIGHBD_OBMCSADMXN(4, 16) HIGHBD_OBMCSADMXN(16, 4) HIGHBD_OBMCSADMXN(8, 32) HIGHBD_OBMCSADMXN(32, 8) +HIGHBD_OBMCSADMXN(16, 64) +HIGHBD_OBMCSADMXN(64, 16) +HIGHBD_OBMCSADMXN(32, 128) +HIGHBD_OBMCSADMXN(128, 32) #endif /* clang-format on */ #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c index 141bf01c7..6ae378ff2 100644 --- a/third_party/aom/aom_dsp/ssim.c +++ b/third_party/aom/aom_dsp/ssim.c @@ -168,23 +168,16 @@ static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight) { - double a, b, c; - double ssimv; - - a = aom_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, - dest->y_stride, source->y_crop_width, source->y_crop_height); - - b = aom_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, source->uv_crop_height); - - c = aom_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, source->uv_crop_height); - - ssimv = a * .8 + .1 * (b + c); + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], source->crop_heights[is_uv]); + } *weight = 1; - - return ssimv; + return abc[0] * .8 + .1 * (abc[1] + abc[2]); } // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity @@ -433,30 +426,19 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd) { - double a, b, c; - double ssimv; - uint32_t shift = 0; - assert(bd >= in_bd); - shift = bd - in_bd; - - a = aom_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, - dest->y_stride, source->y_crop_width, - source->y_crop_height, in_bd, shift); - - b = aom_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - c = 
aom_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - ssimv = a * .8 + .1 * (b + c); + const uint32_t shift = bd - in_bd; + + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], + source->crop_heights[is_uv], in_bd, shift); + } *weight = 1; - - return ssimv; + return abc[0] * .8 + .1 * (abc[1] + abc[2]); } - #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h index 01732ae64..ef9e9bc98 100644 --- a/third_party/aom/aom_dsp/txfm_common.h +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -13,6 +13,7 @@ #define AOM_DSP_TXFM_COMMON_H_ #include "aom_dsp/aom_dsp_common.h" +#include "av1/common/enums.h" // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 @@ -23,18 +24,25 @@ typedef struct txfm_param { // for both forward and inverse transforms - int tx_type; - int tx_size; + TX_TYPE tx_type; + TX_SIZE tx_size; int lossless; int bd; #if CONFIG_MRC_TX || CONFIG_LGT + int is_inter; +#endif // CONFIG_MRC_TX || CONFIG_LGT +#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED int stride; uint8_t *dst; -#endif // CONFIG_MRC_TX || CONFIG_LGT -#if CONFIG_LGT - int is_inter; +#if CONFIG_MRC_TX + int *valid_mask; + uint8_t *mask; +#endif // CONFIG_MRC_TX +#if CONFIG_LGT_FROM_PRED int mode; -#endif + int use_lgt; +#endif // CONFIG_LGT_FROM_PRED +#endif // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED // for inverse transforms only #if CONFIG_ADAPT_SCAN const int16_t *eob_threshold; @@ -87,27 +95,608 @@ static const tran_high_t sinpi_4_9 = 15212; // 16384 * sqrt(2) static const tran_high_t Sqrt2 = 23170; +static const tran_high_t InvSqrt2 = 11585; static INLINE tran_high_t fdct_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); return rv; } -#if CONFIG_LGT -// The Line Graph Transforms (LGTs) matrices are written as follows. -// Each 2D array is 16384 times an LGT matrix, which is the matrix of -// eigenvectors of the graph Laplacian matrices for the line graph. +#if CONFIG_LGT_FROM_PRED +// Use negative numbers so they do not coincide with lgt*[0][0], which are +// always nonnegative. +typedef enum { + DCT4 = -1, + ADST4 = -2, + DCT8 = -3, + ADST8 = -4, + DCT16 = -5, + ADST16 = -6, + DCT32 = -7, + ADST32 = -8, +} ButterflyLgt; -// LGT4 name: lgt4_140 -// Self loops: 1.400, 0.000, 0.000, 0.000 +/* These are some LGTs already implementated in the codec. When any of them + * is chosen, the flgt or ilgt function will call the existing fast + * transform instead of the matrix product implementation. Thus, we + * do not need the actual basis functions here */ +static const tran_high_t lgt4_000[1][1] = { { (tran_high_t)DCT4 } }; +static const tran_high_t lgt4_100[1][1] = { { (tran_high_t)ADST4 } }; +static const tran_high_t lgt8_000[1][1] = { { (tran_high_t)DCT8 } }; +static const tran_high_t lgt8_200[1][1] = { { (tran_high_t)ADST8 } }; +static const tran_high_t lgt16_000[1][1] = { { (tran_high_t)DCT16 } }; +static const tran_high_t lgt16_200[1][1] = { { (tran_high_t)ADST16 } }; +static const tran_high_t lgt32_000[1][1] = { { (tran_high_t)DCT32 } }; +static const tran_high_t lgt32_200[1][1] = { { (tran_high_t)ADST32 } }; + +/* The Line Graph Transforms (LGTs) matrices are written as follows. 
+ Each 2D array is sqrt(2)*16384 times an LGT matrix, which is the + matrix of eigenvectors of the graph Laplacian matrix of the associated + line graph. Some of those transforms have fast algorithms but not + implemented yet for now. */ + +// LGT4 name: lgt4_150_000w3 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_150_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 5991, 13537, 17825, 0 }, + { 15515, 10788, -13408, 0 }, + { 16133, -15403, 6275, 0 }, +}; + +// LGT4 name: lgt4_100_000w3 +// Self loops: 1.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_100_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 7600, 13694, 17076, 0 }, + { 17076, 7600, -13694, 0 }, + { 13694, -17076, 7600, 0 }, +}; + +// LGT4 name: lgt4_060_000w3 +// Self loops: 0.600, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_060_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 9449, 13755, 16075, 0 }, + { 17547, 4740, -14370, 0 }, + { 11819, -18034, 8483, 0 }, +}; + +// LGT4 name: lgt4_000w3 +// Self loops: 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 13377, 13377, 13377, 0 }, + { 16384, 0, -16384, 0 }, + { 9459, -18919, 9459, 0 }, +}; + +// LGT4 name: lgt4_150_000w2 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_150_000w2[4][4] = { + { 10362, 20724, 0, 0 }, + { 20724, -10362, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_100_000w2 +// Self loops: 1.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_100_000w2[4][4] = { + { 12181, 19710, 0, 0 }, + { 19710, -12181, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_060_000w2 +// Self loops: 0.600, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_060_000w2[4][4] = { + { 13831, 18590, 0, 0 }, + { 18590, -13831, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_000w2 +// Self loops: 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_000w2[4][4] = { + { 16384, 16384, 0, 0 }, + { 16384, -16384, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_150_000w1 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_150_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_100_000w1 +// Self loops: 1.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_100_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_060_000w1 +// Self loops: 0.600, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_060_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_000w1 +// Self loops: 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_060 +// Self loops: 0.600, 0.000, 0.000, 0.000 // Edges: 1.000, 1.000, 1.000 -static 
const tran_high_t lgt4_140[4][4] = { - { 4206, 9518, 13524, 15674 }, - { 11552, 14833, 1560, -13453 }, - { 15391, -1906, -14393, 9445 }, - { 12201, -14921, 12016, -4581 }, +static const tran_high_t lgt4_060[4][4] = { + { 6971, 10504, 13060, 14400 }, + { 14939, 11211, -2040, -13559 }, + { 14096, -8258, -12561, 10593 }, + { 8150, -15253, 14295, -5784 }, +}; + +// LGT4 name: lgt4_150 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000 +static const tran_high_t lgt4_150[4][4] = { + { 3998, 9435, 13547, 15759 }, + { 11106, 15105, 1886, -13483 }, + { 15260, -1032, -14674, 9361 }, + { 12833, -14786, 11596, -4372 }, +}; + +// LGT8 name: lgt8_150_000w7 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_150_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 2522, 6185, 9551, 12461, 14775, 16381, 17204, 0 }, + { 7390, 15399, 16995, 11515, 1240, -9551, -16365, 0 }, + { 11716, 16625, 3560, -13353, -15831, -1194, 14733, 0 }, + { 15073, 8866, -14291, -10126, 13398, 11308, -12401, 0 }, + { 16848, -4177, -13724, 14441, 2923, -16628, 9513, 0 }, + { 15942, -14888, 5405, 7137, -15640, 15288, -6281, 0 }, + { 10501, -14293, 16099, -15670, 13063, -8642, 3021, 0 }, +}; + +// LGT8 name: lgt8_100_000w7 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_100_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 3518, 6883, 9946, 12575, 14654, 16093, 16829, 0 }, + { 9946, 16093, 16093, 9946, 0, -9946, -16093, 0 }, + { 14654, 14654, 0, -14654, -14654, 0, 14654, 0 }, + { 16829, 3518, -16093, -6883, 14654, 9946, -12575, 0 }, + { 16093, -9946, -9946, 16093, 0, -16093, 9946, 0 }, + { 12575, -16829, 9946, 3518, -14654, 16093, -6883, 0 }, + { 6883, -12575, 16093, -16829, 14654, -9946, 3518, 0 }, +}; + +// LGT8 name: lgt8_060_000w7 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_060_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 5087, 7951, 10521, 12701, 14411, 15587, 16186, 0 }, + { 13015, 16486, 14464, 7621, -1762, -10557, -15834, 0 }, + { 16581, 11475, -4050, -15898, -13311, 1362, 14798, 0 }, + { 16536, -1414, -16981, -3927, 15746, 8879, -12953, 0 }, + { 14104, -13151, -7102, 16932, -1912, -15914, 10385, 0 }, + { 10156, -17168, 11996, 1688, -14174, 16602, -7249, 0 }, + { 5295, -11721, 15961, -17224, 15274, -10476, 3723, 0 }, +}; + +// LGT8 name: lgt8_000w7 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 12385, 12385, 12385, 12385, 12385, 12385, 12385, 0 }, + { 17076, 13694, 7600, 0, -7600, -13694, -17076, 0 }, + { 15781, 3898, -10921, -17515, -10921, 3898, 15781, 0 }, + { 13694, -7600, -17076, 0, 17076, 7600, -13694, 0 }, + { 10921, -15781, -3898, 17515, -3898, -15781, 10921, 0 }, + { 7600, -17076, 13694, 0, -13694, 17076, -7600, 0 }, + { 3898, -10921, 15781, -17515, 15781, -10921, 3898, 0 }, +}; + +// LGT8 name: lgt8_150_000w6 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_150_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 3157, 7688, 
11723, 15002, 17312, 18506, 0, 0 }, + { 9167, 17832, 16604, 6164, -7696, -17286, 0, 0 }, + { 14236, 15584, -4969, -18539, -6055, 14938, 0, 0 }, + { 17558, 1891, -18300, 5288, 16225, -11653, 0, 0 }, + { 17776, -13562, -647, 14380, -17514, 7739, 0, 0 }, + { 12362, -16318, 17339, -15240, 10399, -3688, 0, 0 }, +}; + +// LGT8 name: lgt8_100_000w6 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_100_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 4350, 8447, 12053, 14959, 16995, 18044, 0, 0 }, + { 12053, 18044, 14959, 4350, -8447, -16995, 0, 0 }, + { 16995, 12053, -8447, -18044, -4350, 14959, 0, 0 }, + { 18044, -4350, -16995, 8447, 14959, -12053, 0, 0 }, + { 14959, -16995, 4350, 12053, -18044, 8447, 0, 0 }, + { 8447, -14959, 18044, -16995, 12053, -4350, 0, 0 }, +}; + +// LGT8 name: lgt8_060_000w6 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_060_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 6154, 9551, 12487, 14823, 16446, 17277, 0, 0 }, + { 15149, 17660, 12503, 1917, -9502, -16795, 0, 0 }, + { 18166, 7740, -11772, -17465, -2656, 15271, 0, 0 }, + { 16682, -8797, -15561, 10779, 14189, -12586, 0, 0 }, + { 12436, -18234, 7007, 10763, -18483, 8945, 0, 0 }, + { 6591, -14172, 18211, -17700, 12766, -4642, 0, 0 }, +}; + +// LGT8 name: lgt8_000w6 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 13377, 13377, 13377, 13377, 13377, 13377, 0, 0 }, + { 18274, 13377, 4896, -4896, -13377, -18274, 0, 0 }, + { 16384, 0, -16384, -16384, 0, 16384, 0, 0 }, + { 13377, -13377, -13377, 13377, 13377, -13377, 0, 0 }, + { 9459, -18919, 9459, 9459, -18919, 9459, 0, 0 }, + { 4896, -13377, 18274, -18274, 13377, -4896, 0, 0 }, +}; + +// LGT8 name: lgt8_150_000w5 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w5[8][8] = { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 4109, 9895, 14774, 18299, 20146, 0, 0, 0 }, + { 11753, 20300, 13161, -4148, -18252, 0, 0, 0 }, + { 17573, 10921, -16246, -12895, 14679, 0, 0, 0 }, + { 19760, -9880, -9880, 19760, -9880, 0, 0, 0 }, + { 14815, -18624, 17909, -12844, 4658, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_100_000w5 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w5[8][8] = { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 5567, 10683, 14933, 17974, 19559, 0, 0, 0 }, + { 14933, 19559, 10683, -5567, -17974, 0, 0, 0 }, + { 19559, 5567, -17974, -10683, 14933, 0, 0, 0 }, + { 17974, -14933, -5567, 19559, -10683, 0, 0, 0 }, + { 10683, -17974, 19559, -14933, 5567, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_060_000w5 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w5[8][8] 
= { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 7650, 11741, 15069, 17415, 18628, 0, 0, 0 }, + { 17824, 18002, 7558, -7345, -17914, 0, 0, 0 }, + { 19547, 569, -19303, -8852, 15505, 0, 0, 0 }, + { 15592, -17548, -2862, 19625, -11374, 0, 0, 0 }, + { 8505, -17423, 20218, -15907, 6006, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_000w5 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_000w5[8][8] = { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 14654, 14654, 14654, 14654, 14654, 0, 0, 0 }, + { 19710, 12181, 0, -12181, -19710, 0, 0, 0 }, + { 16766, -6404, -20724, -6404, 16766, 0, 0, 0 }, + { 12181, -19710, 0, 19710, -12181, 0, 0, 0 }, + { 6404, -16766, 20724, -16766, 6404, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_150_000w4 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w4[8][8] = { + { 5655, 13343, 19159, 22286, 0, 0, 0, 0 }, + { 15706, 21362, 2667, -19068, 0, 0, 0, 0 }, + { 21580, -1459, -20752, 13238, 0, 0, 0, 0 }, + { 18148, -20910, 16399, -6183, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_100_000w4 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w4[8][8] = { + { 7472, 14042, 18919, 21513, 0, 0, 0, 0 }, + { 18919, 18919, 0, -18919, 0, 0, 0, 0 }, + { 21513, -7472, -18919, 14042, 0, 0, 0, 0 }, + { 14042, -21513, 18919, -7472, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_060_000w4 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w4[8][8] = { + { 9858, 14855, 18470, 20365, 0, 0, 0, 0 }, + { 21127, 15855, -2886, -19175, 0, 0, 0, 0 }, + { 19935, -11679, -17764, 14980, 0, 0, 0, 0 }, + { 11525, -21570, 20217, -8180, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_000w4 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w4[8][8] = { + { 16384, 16384, 16384, 16384, 0, 0, 0, 0 }, + { 21407, 8867, -8867, -21407, 0, 0, 0, 0 }, + { 16384, -16384, -16384, 16384, 0, 0, 0, 0 }, + { 8867, -21407, 21407, -8867, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_150_000w3 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w3[8][8] = { + { 8473, 19144, 25209, 0, 0, 0, 0, 
0 }, + { 21942, 15257, -18961, 0, 0, 0, 0, 0 }, + { 22815, -21783, 8874, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_100_000w3 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w3[8][8] = { + { 10747, 19366, 24149, 0, 0, 0, 0, 0 }, + { 24149, 10747, -19366, 0, 0, 0, 0, 0 }, + { 19366, -24149, 10747, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_060_000w3 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w3[8][8] = { + { 13363, 19452, 22733, 0, 0, 0, 0, 0 }, + { 24815, 6704, -20323, 0, 0, 0, 0, 0 }, + { 16715, -25503, 11997, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_000w3 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w3[8][8] = { + { 18919, 18919, 18919, 0, 0, 0, 0, 0 }, + { 23170, 0, -23170, 0, 0, 0, 0, 0 }, + { 13377, -26755, 13377, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_150_000w2 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w2[8][8] = { + { 14654, 29309, 0, 0, 0, 0, 0, 0 }, + { 29309, -14654, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_100_000w2 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w2[8][8] = { + { 17227, 27874, 0, 0, 0, 0, 0, 0 }, + { 27874, -17227, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_060_000w2 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w2[8][8] = { + { 
19560, 26290, 0, 0, 0, 0, 0, 0 }, + { 26290, -19560, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_000w2 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w2[8][8] = { + { 23170, 23170, 0, 0, 0, 0, 0, 0 }, + { 23170, -23170, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_150_000w1 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_100_000w1 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_060_000w1 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_000w1 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, 
-10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_060 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060[8][8] = { + { 4295, 6746, 8999, 10987, 12653, 13947, 14832, 15280 }, + { 11303, 15101, 14912, 10786, 3812, -4168, -11047, -15010 }, + { 15051, 13208, 1823, -10879, -15721, -9207, 3959, 14265 }, + { 15871, 3800, -13441, -12395, 5516, 15922, 4665, -12939 }, + { 14630, -7269, -13926, 8618, 13091, -9886, -12133, 11062 }, + { 12008, -14735, 180, 14586, -12245, -4458, 15932, -8720 }, + { 8472, -15623, 14088, -4721, -7272, 15221, -14708, 6018 }, + { 4372, -9862, 13927, -15981, 15727, -13202, 8770, -3071 }, +}; + +// LGT8 name: lgt8_100 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100[8][8] = { + { 2921, 5742, 8368, 10708, 12684, 14228, 15288, 15827 }, + { 8368, 14228, 15827, 12684, 5742, -2921, -10708, -15288 }, + { 12684, 15288, 5742, -8368, -15827, -10708, 2921, 14228 }, + { 15288, 8368, -10708, -14228, 2921, 15827, 5742, -12684 }, + { 15827, -2921, -15288, 5742, 14228, -8368, -12684, 10708 }, + { 14228, -12684, -2921, 15288, -10708, -5742, 15827, -8368 }, + { 10708, -15827, 12684, -2921, -8368, 15288, -14228, 5742 }, + { 5742, -10708, 14228, -15827, 15288, -12684, 8368, -2921 }, }; +#endif // CONFIG_LGT_FROM_PRED +#if CONFIG_LGT || CONFIG_LGT_FROM_PRED // LGT4 name: lgt4_170 // Self loops: 1.700, 0.000, 0.000, 0.000 // Edges: 1.000, 1.000, 1.000 @@ -118,18 +707,14 @@ static const tran_high_t lgt4_170[4][4] = { { 14138, -14420, 10663, -3920 }, }; -// LGT8 name: lgt8_150 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_150[8][8] = { - { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 }, - { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 }, - { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 }, - { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 }, - { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 }, - { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 }, - { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 }, - { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 }, +// LGT4 name: lgt4_140 +// Self loops: 1.400, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000 +static const tran_high_t lgt4_140[4][4] = { + { 4206, 9518, 13524, 15674 }, + { 11552, 14833, 1560, -13453 }, + { 15391, -1906, -14393, 9445 }, + { 12201, -14921, 12016, -4581 }, }; // LGT8 name: lgt8_170 @@ -145,5 +730,19 @@ static const tran_high_t lgt8_170[8][8] = { { 15533, -13869, 6559, 3421, -12009, 15707, -13011, 5018 }, { 11357, -13726, 14841, -14600, 13025, -10259, 6556, -2254 }, }; -#endif // CONFIG_LGT + +// LGT8 name: lgt8_150 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150[8][8] = { + { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 }, + { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 }, + { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 }, + { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 }, + { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 }, + { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 }, + { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 
}, + { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 }, +}; +#endif // CONFIG_LGT || CONFIG_LGT_FROM_PRED #endif // AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index a4c3616e7..3c99aa155 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -256,7 +256,13 @@ VARIANCES(4, 16) VARIANCES(16, 4) VARIANCES(8, 32) VARIANCES(32, 8) -#endif +VARIANCES(16, 64) +VARIANCES(64, 16) +#if CONFIG_EXT_PARTITION +VARIANCES(32, 128) +VARIANCES(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES GET_VAR(16, 16) GET_VAR(8, 8) @@ -661,7 +667,13 @@ HIGHBD_VARIANCES(4, 16) HIGHBD_VARIANCES(16, 4) HIGHBD_VARIANCES(8, 32) HIGHBD_VARIANCES(32, 8) -#endif +HIGHBD_VARIANCES(16, 64) +HIGHBD_VARIANCES(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_VARIANCES(32, 128) +HIGHBD_VARIANCES(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) @@ -761,7 +773,7 @@ void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, } #endif // CONFIG_HIGHBITDEPTH -#if CONFIG_AV1 && CONFIG_EXT_INTER +#if CONFIG_AV1 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, @@ -848,7 +860,13 @@ MASK_SUBPIX_VAR(4, 16) MASK_SUBPIX_VAR(16, 4) MASK_SUBPIX_VAR(8, 32) MASK_SUBPIX_VAR(32, 8) -#endif +MASK_SUBPIX_VAR(16, 64) +MASK_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +MASK_SUBPIX_VAR(32, 128) +MASK_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #if CONFIG_HIGHBITDEPTH void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, @@ -985,9 +1003,15 @@ HIGHBD_MASK_SUBPIX_VAR(4, 16) HIGHBD_MASK_SUBPIX_VAR(16, 4) HIGHBD_MASK_SUBPIX_VAR(8, 32) HIGHBD_MASK_SUBPIX_VAR(32, 8) -#endif +HIGHBD_MASK_SUBPIX_VAR(16, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR(32, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 && CONFIG_EXT_INTER +#endif // CONFIG_AV1 #if CONFIG_AV1 && CONFIG_MOTION_VAR static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, @@ -1094,7 +1118,17 @@ OBMC_VAR(8, 32) OBMC_SUBPIX_VAR(8, 32) OBMC_VAR(32, 8) OBMC_SUBPIX_VAR(32, 8) -#endif +OBMC_VAR(16, 64) +OBMC_SUBPIX_VAR(16, 64) +OBMC_VAR(64, 16) +OBMC_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +OBMC_VAR(32, 128) +OBMC_SUBPIX_VAR(32, 128) +OBMC_VAR(128, 32) +OBMC_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #if CONFIG_HIGHBITDEPTH static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, @@ -1287,6 +1321,16 @@ HIGHBD_OBMC_VAR(8, 32) HIGHBD_OBMC_SUBPIX_VAR(8, 32) HIGHBD_OBMC_VAR(32, 8) HIGHBD_OBMC_SUBPIX_VAR(32, 8) -#endif +HIGHBD_OBMC_VAR(16, 64) +HIGHBD_OBMC_SUBPIX_VAR(16, 64) +HIGHBD_OBMC_VAR(64, 16) +HIGHBD_OBMC_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_OBMC_VAR(32, 128) +HIGHBD_OBMC_SUBPIX_VAR(32, 128) +HIGHBD_OBMC_VAR(128, 32) +HIGHBD_OBMC_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index 20f0895cb..a193df467 100644 --- a/third_party/aom/aom_dsp/variance.h +++ 
b/third_party/aom/aom_dsp/variance.h @@ -54,7 +54,7 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse, const uint8_t *second_pred); -#if CONFIG_AV1 && CONFIG_EXT_INTER +#if CONFIG_AV1 typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, @@ -64,7 +64,7 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); -#endif // CONFIG_AV1 && CONFIG_EXT_INTER +#endif // CONFIG_AV1 #if CONFIG_AV1 && CONFIG_MOTION_VAR typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, @@ -90,10 +90,8 @@ typedef struct aom_variance_vtable { aom_sad_multi_fn_t sdx3f; aom_sad_multi_fn_t sdx8f; aom_sad_multi_d_fn_t sdx4df; -#if CONFIG_EXT_INTER aom_masked_sad_fn_t msdf; aom_masked_subpixvariance_fn_t msvf; -#endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR aom_obmc_sad_fn_t osdf; aom_obmc_variance_fn_t ovf; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm index 357f37401..8688fb544 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -346,9 +346,15 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ psraw m0, 7 psraw m4, 7 %ifidn %1, h8_add_src +%if ARCH_X86=1 && CONFIG_PIC=1 + pcmpeqb m2, m2 ;all ones + psrlw m2, 8 ;even_byte_mask +%else + mova m2, [GLOBAL(even_byte_mask)] +%endif movu m5, [srcq] mova m7, m5 - pand m5, [even_byte_mask] + pand m5, m2 psrlw m7, 8 paddsw m0, m5 paddsw m4, m7 diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h new file mode 100644 index 000000000..5f9596a74 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_X86_COMMON_AVX2_H +#define AOM_DSP_X86_COMMON_AVX2_H + +#include <immintrin.h> + +#include "./aom_config.h" + +// Note: in and out could have the same value +static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { + __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); + __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); + __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); + __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); + __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); + __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); + __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); + __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); + + __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); + __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); + __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); + __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); + __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); + __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); + __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); + __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); + + // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b + // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f + // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b + // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f + // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b + // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f + // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b + // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f + + // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b + // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f + // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb + // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf + // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db + // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df + // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb + // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff + + __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); + __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); + __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); + __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); + __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); + __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); + __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); + __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); + + __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); + __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); + __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); + __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); + __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); + __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); + __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); + __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); + + // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 + // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b + // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d + // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f + // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 + // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b + // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d + // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f + + // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 + // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb + // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd + // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf + // c0 d0 e0 f0 c1 d1 e1 
f1 c8 d8 e8 f8 c9 d9 e9 f9 + // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb + // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd + // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff + + tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + + tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); + tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); + tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); + tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); + tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); + tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); + tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); + tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); + + // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a + // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b + // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c + // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d + // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e + // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f + + // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 + // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 + // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa + // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb + // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc + // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd + // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe + // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff + + out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); +} +#endif diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h index d3aceae00..86df4a6f6 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h @@ -15,21 +15,21 @@ #include "./aom_config.h" static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) { -#if CONFIG_HIGHBITDEPTH - const __m256i zero = _mm256_setzero_si256(); - const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); + if (sizeof(tran_low_t) == 4) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); - __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); - __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); + __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); + 
__m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); - __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); - __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); + __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); + __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); - _mm256_storeu_si256((__m256i *)out, y0); - _mm256_storeu_si256((__m256i *)(out + 8), y1); -#else - _mm256_storeu_si256((__m256i *)out, *coeff); -#endif + _mm256_storeu_si256((__m256i *)out, y0); + _mm256_storeu_si256((__m256i *)(out + 8), y1); + } else { + _mm256_storeu_si256((__m256i *)out, *coeff); + } } #endif // AOM_DSP_X86_FWD_TXFM_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 26b2db2e0..58e8971dd 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -247,16 +247,16 @@ static INLINE int k_check_epi32_overflow_32( } static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_store_si128((__m128i *)(dst_ptr), out0); - _mm_store_si128((__m128i *)(dst_ptr + 4), out1); -#else - _mm_store_si128((__m128i *)(dst_ptr), *poutput); -#endif // CONFIG_HIGHBITDEPTH + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_store_si128((__m128i *)(dst_ptr), *poutput); + } } static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c new file mode 100644 index 000000000..41b55c985 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// D45E_PRED +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y, + const __m256i *z) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a = _mm256_avg_epu16(*x, *z); + const __m256i b = + _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one)); + return _mm256_avg_epu16(b, *y); +} + +static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1, + const __m256i *a2, uint16_t **dst, + ptrdiff_t stride) { + const __m256i y = avg3_epu16(a0, a1, a2); + _mm256_storeu_si256((__m256i *)*dst, y); + *dst += stride; +} + +void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 9); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 9)); + x0 = _mm256_insert_epi16(x0, above[23], 15); + const __m256i y = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 15); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); + x2 = _mm256_insert_epi16(x2, above[31], 15); + const __m256i y = avg3_epu16(&x0, &x1, &x2); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, 
&dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 33); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); + x0 = _mm256_insert_epi16(x0, above[47], 15); + const __m256i y = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); + __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); + + uint16_t *dst1 = dst; + uint16_t *dst2 = dst + 16; + + d45e_w16(&x0, &x1, &x2, &dst1, stride); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x0, &x1, &x2, &dst1, stride); + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + } while (i < 15); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i u = avg3_epu16(&x0, &x1, &x2); + _mm256_storeu_si256((__m256i *)dst1, u); + + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17)); + y2 = _mm256_insert_epi16(y2, above[47], 15); + u = avg3_epu16(&y0, &y1, &y2); + _mm256_storeu_si256((__m256i *)dst2, u); +} + +void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); + __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); + + uint16_t *dst1 = dst; + uint16_t *dst2 = dst + 16; + + d45e_w16(&x0, &x1, &x2, &dst1, stride); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x2, &x0, &x1, &dst1, 
stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x0, &x1, &x2, &dst1, stride); + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + } while (i < 33); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); + __m256i u = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst1, u); + + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33)); + y0 = _mm256_insert_epi16(y0, above[63], 15); + u = avg3_epu16(&y1, &y2, &y0); + _mm256_storeu_si256((__m256i *)dst2, u); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm index 5d84ef8a7..91b3d126c 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm @@ -257,200 +257,3 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above dec nlines4d jnz .loop REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps - movd m1, [aboveq-2] - movq m0, [aboveq] - pshuflw m1, m1, 0x0 - movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 - movlhps m1, m1 ; tl tl tl tl tl tl tl tl - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m3, m3 - movd m4, bpsd - psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl - psllw m3, m4 - pcmpeqw m2, m2 - pxor m4, m4 ; min possible value - pxor m3, m2 ; max possible value - mova m1, [leftq] - pshuflw m2, m1, 0x0 - pshuflw m5, m1, 0x55 - movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m2, m3 - pmaxsw m2, m4 - ;Store the values - movq [dstq ], m2 - movhpd [dstq+strideq*2], m2 - lea dstq, [dstq+strideq*4] - pshuflw m2, m1, 0xaa - pshuflw m5, m1, 0xff - movlhps m2, m5 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m2, m3 - pmaxsw m2, m4 - ;Store the values - movq [dstq ], m2 - movhpd [dstq+strideq*2], m2 - RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one - movd m1, [aboveq-2] - mova m0, [aboveq] - pshuflw m1, m1, 0x0 - ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m3, m3 - pxor m4, m4 - pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 - pshuflw m3, m3, 0x0 - DEFINE_ARGS dst, stride, line, left - punpcklqdq m3, m3 - mov lineq, -4 - mova m2, m3 - punpcklqdq m1, m1 - psllw m3, m4 - add leftq, 16 - psubw m3, m2 ; max possible value - pxor m4, m4 ; min possible value - psubw m0, m1 -.loop: - movd m1, [leftq+lineq*4] - movd m2, [leftq+lineq*4+2] - pshuflw m1, m1, 0x0 - pshuflw m2, m2, 0x0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - paddw m1, m0 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m1, m3 - pminsw m2, m3 - pmaxsw m1, m4 - pmaxsw m2, m4 - ;Store the values - mova [dstq ], m1 - mova [dstq+strideq*2], m2 - lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps - movd m2, [aboveq-2] - mova m0, [aboveq] - mova m1, [aboveq+16] - pshuflw m2, m2, 0x0 - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m3, m3 - movd m4, bpsd - punpcklqdq m2, m2 - psllw m3, m4 - pcmpeqw m5, m5 - pxor m4, m4 ; min possible value - pxor m3, m5 ; max possible value - DEFINE_ARGS dst, stride, line, left - mov lineq, -8 - psubw m0, m2 - psubw m1, m2 -.loop: - movd m7, [leftq] - pshuflw m5, m7, 0x0 - pshuflw m2, m7, 0x55 - 
punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 - punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 - paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1 - paddw m5, m1 ; t5-tl+l1 to t8-tl+l1 - pminsw m6, m3 - pminsw m5, m3 - pmaxsw m6, m4 ; Clamp to the bit-depth - pmaxsw m5, m4 - mova [dstq ], m6 - mova [dstq +16], m5 - paddw m6, m2, m0 - paddw m2, m1 - pminsw m6, m3 - pminsw m2, m3 - pmaxsw m6, m4 - pmaxsw m2, m4 - mova [dstq+strideq*2 ], m6 - mova [dstq+strideq*2+16], m2 - lea dstq, [dstq+strideq*4] - inc lineq - lea leftq, [leftq+4] - - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps - movd m0, [aboveq-2] - mova m1, [aboveq] - mova m2, [aboveq+16] - mova m3, [aboveq+32] - mova m4, [aboveq+48] - pshuflw m0, m0, 0x0 - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m5, m5 - movd m6, bpsd - psllw m5, m6 - pcmpeqw m7, m7 - pxor m6, m6 ; min possible value - pxor m5, m7 ; max possible value - punpcklqdq m0, m0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - psubw m1, m0 - psubw m2, m0 - psubw m3, m0 - psubw m4, m0 -.loop: - movd m7, [leftq] - pshuflw m7, m7, 0x0 - punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1 - paddw m0, m7, m1 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq ], m0 - paddw m0, m7, m2 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +16], m0 - paddw m0, m7, m3 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +32], m0 - paddw m0, m7, m4 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +48], m0 - movd m7, [leftq+2] - pshuflw m7, m7, 0x0 - punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2 - paddw m0, m7, m1 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2 ], m0 - paddw m0, m7, m2 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+16], m0 - paddw m0, m7, m3 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+32], m0 - paddw m0, m7, m4 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+48], m0 - lea dstq, [dstq+strideq*4] - lea leftq, [leftq+4] - inc lineq - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c new file mode 100644 index 000000000..691e166cf --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c @@ -0,0 +1,1256 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); + dst += stride << 2; + left += 4; + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); +} + +void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); +} + +void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); + dst += stride << 3; + left += 8; + 
aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + h_predictor_16x8(dst, stride, left); +} + +void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 
0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP, DC_LEFT, DC_128 + +// 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 4x8 + +static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +// Shared with DC 8xh +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t 
*dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sum = dc_sum_8(left); + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x8(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 8xh + +static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, const uint16_t *above) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + dc_store_8xh(dst, stride, height, &dc); +} + +void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 4, above); +} + +void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 8, above); +} + +void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 16, above); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 4, &dc); +} + +void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 8, &dc); +} + +// Shared with DC 16xh +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + 
const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 16, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + dc_store_8xh(dst, stride, height, &dc_dup); +} + +void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 4, bd); +} + +void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 8, bd); +} + +void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 16, bd); +} + +// ----------------------------------------------------------------------------- +// 16xh + +static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +// Shared with 32xh +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = 
_mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 8, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 32xh + +static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = 
_mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); + int i; + for (i = 0; i < 2; ++i) { + _mm_storel_epi64((__m128i *)dst, above_u16); + _mm_storel_epi64((__m128i *)(dst + stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); +} + +void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void 
aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_above = dc_sum_4(above); + const __m128i sum_left = dc_sum_8(left); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_left = dc_sum_4(left); + const __m128i sum_above = dc_sum_8(above); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); +} + 
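The rectangular DC predictors added just above and below all follow one rounding pattern: the width-many above samples and the height-many left samples are summed, then divided by (width + height) with round-to-nearest, which is where the constants come from ((sum + 6) / 12 for 8x4, (sum + 12) / 24 for 8x16, and so on). A minimal scalar sketch of that computation, for illustration only; the helper name is hypothetical and is not part of this patch:

static void highbd_dc_predictor_ref(uint16_t *dst, ptrdiff_t stride, int w,
                                    int h, const uint16_t *above,
                                    const uint16_t *left) {
  // Rounded average of the w above samples and the h left samples,
  // then the whole w x h block is filled with that value.
  int sum = 0, i, r, c;
  for (i = 0; i < w; ++i) sum += above[i];
  for (i = 0; i < h; ++i) sum += left[i];
  {
    const uint16_t dc = (uint16_t)((sum + ((w + h) >> 1)) / (w + h));
    for (r = 0; r < h; ++r, dst += stride) {
      for (c = 0; c < w; ++c) dst[c] = dc;
    }
  }
}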
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + 
_mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i 
*)dst, row3); +} + +void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); +} + +void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i h76543210 = _mm_load_si128((const __m128i *)above); + __m128i hx7654321 = _mm_srli_si128(h76543210, 2); + __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7); + __m128i hx8765432 = _mm_srli_si128(h87654321, 2); + __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7); + __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432); + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8)); + dst += stride; + + // hcba98765 + h76543210 = _mm_loadu_si128((const __m128i *)((above + 5))); + h76543210 = _mm_insert_epi16(h76543210, above[11], 7); + // hxcba9876 + hx7654321 = _mm_srli_si128(h76543210, 2); + // hxxcba987 + hx8765432 = _mm_srli_si128(h76543210, 4); + avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432); + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); +} + +void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { 
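  // Illustrative aside (not part of the patch) on the rounding identity the
  // d45e predictors rely on, restating the "trick from pascal" comment above:
  // for unsigned x, y, z, (x + 2*y + z + 2) >> 2 equals
  //   avg(avg(x, z) - ((x ^ z) & 1), y)
  // because avg() rounds up, and subtracting (x ^ z) & 1 removes that
  // round-up exactly when x + z is odd. Example: x = 1, y = 2, z = 2 gives
  // (1 + 4 + 2 + 2) >> 2 = 2; via the trick, avg(1, 2) = 2, minus
  // ((1 ^ 2) & 1) = 1 leaves 1, and avg(1, 2) = 2, which matches.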
+ (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + __m128i y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x0 = _mm_loadu_si128((const __m128i *)(above + 3)); + y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x1 = _mm_loadu_si128((const __m128i *)(above + 4)); + y = avg3_epu16(&x2, &x0, &x1); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x2 = _mm_loadu_si128((const __m128i *)(above + 5)); + x2 = _mm_insert_epi16(x2, above[11], 7); + y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); +} + +static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1, + const __m128i *a2, uint16_t **dst, + ptrdiff_t stride) { + const __m128i y = avg3_epu16(a0, a1, a2); + _mm_storeu_si128((__m128i *)*dst, y); + *dst += stride; +} + +void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + + d45e_w8(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x1, &x2, &x0, &dst, stride); + + x1 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x2, &x0, &x1, &dst, stride); + + x2 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x0, &x1, &x2, &dst, stride); + } while (i < 9); + + x0 = _mm_loadu_si128((const __m128i *)(above + 9)); + x0 = _mm_insert_epi16(x0, above[15], 7); + const __m128i y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); +} + +void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + + d45e_w8(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x1, &x2, &x0, &dst, stride); + + x1 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x2, &x0, &x1, &dst, stride); + + x2 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x0, &x1, &x2, &dst, stride); + } while (i < 15); + + x0 = _mm_loadu_si128((const __m128i *)(above + 15)); + __m128i y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x1 = _mm_loadu_si128((const __m128i *)(above + 16)); + y = avg3_epu16(&x2, &x0, &x1); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x2 = _mm_loadu_si128((const __m128i *)(above + 17)); + x2 = _mm_insert_epi16(x2, above[23], 7); + y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c new file mode 100644 index 000000000..b089a3f43 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 +}; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const 
__m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + 
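  // Each pass of the inner loop writes two complete 32-pixel rows: the
  // avg2-based row (rowa_0..rowa_3) followed by the avg3-based row
  // (rowb_0..rowb_3). The rows are then shifted one pixel to the right with
  // _mm_alignr_epi8(), and rotr_epu16() feeds the next filtered left-column
  // sample into the leftmost lane of rowa_0 and rowb_0.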
_mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = 
_mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, 
_mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 
8), row_1); + dst += stride; + } + } +} + +void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + 
row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c new file mode 100644 index 000000000..94c68885c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/common_avx2.h" +#include "aom_dsp/x86/lpf_common_sse2.h" +#include "aom/aom_integer.h" + +#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m256i *blt, + __m256i *lt, __m256i *thr) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *blt = _mm256_slli_epi16(y, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *lt = _mm256_slli_epi16(y, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *thr = _mm256_slli_epi16(y, shift); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m256i *p, __m256i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch)); + q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch)); + } +} + +static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q, + const __m256i *t, __m256i *hev) { + const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0])); + const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0])); + __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0); + h = _mm256_subs_epu16(h, *t); + + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + const __m256i zero = _mm256_setzero_si256(); + *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff); +} + +static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q, + const __m256i *l, const __m256i *bl, + __m256i *mask) { + __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0])); + __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1])); + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff); + max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm256_max_epi16(max, + _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1]))); + max = _mm256_max_epi16(max, + _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1]))); + } + max = _mm256_subs_epu16(max, *l); + *mask = _mm256_cmpeq_epi16(max, zero); // return ~mask +} + +static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p, + const __m256i *q, int bd, int start, + int end, __m256i *flat) { + __m256i max = _mm256_setzero_si256(); + int i; + for (i = start; i < end; ++i) { + max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0]))); + max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0]))); + } + + __m256i ft; + if (bd == 8) + ft = _mm256_subs_epu16(max, *th); + else if (bd == 10) + ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2)); + else // bd == 12 + ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4)); + + const __m256i zero = _mm256_setzero_si256(); + *flat = _mm256_cmpeq_epi16(ft, zero); +} + +// Note: +// Access p[3-1], p[0], and q[3-1], q[0] +static INLINE void 
highbd_flat_mask4(const __m256i *th, const __m256i *p, + const __m256i *q, __m256i *flat, int bd) { + // check the distance 1,2,3 against 0 + flat_mask_internal(th, p, q, bd, 1, 4, flat); +} + +// Note: +// access p[7-4], p[0], and q[7-4], q[0] +static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p, + const __m256i *q, __m256i *flat, int bd) { + flat_mask_internal(th, p, q, bd, 4, 8, flat); +} + +static INLINE void pixel_clamp(const __m256i *min, const __m256i *max, + __m256i *pixel) { + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(*pixel, *max); + clamped = _mm256_andnot_si256(mask, *pixel); + mask = _mm256_and_si256(mask, *max); + clamped = _mm256_or_si256(mask, clamped); + + mask = _mm256_cmpgt_epi16(clamped, *min); + clamped = _mm256_and_si256(mask, clamped); + mask = _mm256_andnot_si256(mask, *min); + *pixel = _mm256_or_si256(clamped, mask); +} + +static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask, + const __m256i *th, int bd, __m256i *ps, + __m256i *qs) { + __m256i t80; + if (bd == 8) + t80 = _mm256_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm256_set1_epi16(0x200); + else // bd == 12 + t80 = _mm256_set1_epi16(0x800); + + __m256i ps0 = _mm256_subs_epi16(p[0], t80); + __m256i ps1 = _mm256_subs_epi16(p[1], t80); + __m256i qs0 = _mm256_subs_epi16(q[0], t80); + __m256i qs1 = _mm256_subs_epi16(q[1], t80); + + const __m256i one = _mm256_set1_epi16(1); + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i zero = _mm256_setzero_si256(); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filter = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); + + __m256i hev; + highbd_hev_mask(p, q, th, &hev); + filter = _mm256_and_si256(filter, hev); + + const __m256i x = _mm256_subs_epi16(qs0, ps0); + filter = _mm256_adds_epi16(filter, x); + filter = _mm256_adds_epi16(filter, x); + filter = _mm256_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm256_and_si256(filter, *mask); + + const __m256i t3 = _mm256_set1_epi16(3); + const __m256i t4 = _mm256_set1_epi16(4); + + __m256i filter1 = _mm256_adds_epi16(filter, t4); + __m256i filter2 = _mm256_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm256_srai_epi16(filter1, 3); + filter2 = _mm256_srai_epi16(filter2, 3); + + qs0 = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); + + qs[0] = _mm256_adds_epi16(qs0, t80); + ps[0] = _mm256_adds_epi16(ps0, t80); + + filter = _mm256_adds_epi16(filter1, one); + filter = _mm256_srai_epi16(filter, 1); + filter = _mm256_andnot_si256(hev, filter); + + qs1 = _mm256_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm256_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); + + qs[1] = _mm256_adds_epi16(qs1, t80); + ps[1] = _mm256_adds_epi16(ps1, t80); +} +#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 + +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd); +} + +void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, + const uint8_t *blt, const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd); +} 
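  // pixel_clamp(), defined earlier in this file, builds clamp(*pixel, *min,
  // *max) out of signed compares plus and/andnot/or. For 16-bit lanes the
  // same result can also be expressed with the AVX2 packed min/max
  // intrinsics; a minimal equivalence sketch (illustration only, not part of
  // the patch):
  //
  //   static INLINE __m256i pixel_clamp_minmax(const __m256i *min,
  //                                            const __m256i *max,
  //                                            const __m256i *pixel) {
  //     // clamp(x, lo, hi) == max(lo, min(x, hi)) for signed 16-bit lanes.
  //     return _mm256_max_epi16(*min, _mm256_min_epi16(*pixel, *max));
  //   }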
+ +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} +#else +void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + __m256i blimit, limit, thresh; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); + + __m256i p[8], q[8]; + load_highbd_pixel(s, 8, pitch, p, q); + + __m256i mask; + highbd_filter_mask(p, q, &limit, &blimit, &mask); + + __m256i flat, flat2; + const __m256i one = _mm256_set1_epi16(1); + highbd_flat_mask4(&one, p, q, &flat, bd); + highbd_flat_mask5(&one, p, q, &flat2, bd); + + flat = _mm256_and_si256(flat, mask); + flat2 = _mm256_and_si256(flat2, flat); + + __m256i ps[2], qs[2]; + highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); + + // flat and wide flat calculations + __m256i flat_p[3], flat_q[3]; + __m256i flat2_p[7], flat2_q[7]; + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + + __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]), + _mm256_add_epi16(p[4], p[3])); + __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]), + _mm256_add_epi16(q[4], q[3])); + + __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1])); + sum_p = _mm256_add_epi16(sum_p, sum_lp); + + __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1])); + sum_q = _mm256_add_epi16(sum_q, sum_lq); + sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q)); + sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq)); + + flat2_p[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4); + flat2_q[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4); + flat_p[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3); + flat_q[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3); + + __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]); + __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]); + __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]); + __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]); + + sum_q = _mm256_sub_epi16(sum_p, p[6]); + sum_p = _mm256_sub_epi16(sum_p, q[6]); + flat2_p[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4); + 
flat2_q[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4); + + sum_lq = _mm256_sub_epi16(sum_lp, p[2]); + sum_lp = _mm256_sub_epi16(sum_lp, q[2]); + flat_p[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3); + + sum_p7 = _mm256_add_epi16(sum_p7, p[7]); + sum_q7 = _mm256_add_epi16(sum_q7, q[7]); + sum_p3 = _mm256_add_epi16(sum_p3, p[3]); + sum_q3 = _mm256_add_epi16(sum_q3, q[3]); + + sum_p = _mm256_sub_epi16(sum_p, q[5]); + sum_q = _mm256_sub_epi16(sum_q, p[5]); + flat2_p[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4); + flat2_q[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4); + + sum_lp = _mm256_sub_epi16(sum_lp, q[1]); + sum_lq = _mm256_sub_epi16(sum_lq, p[1]); + flat_p[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3); + + int i; + for (i = 3; i < 7; ++i) { + sum_p7 = _mm256_add_epi16(sum_p7, p[7]); + sum_q7 = _mm256_add_epi16(sum_q7, q[7]); + sum_p = _mm256_sub_epi16(sum_p, q[7 - i]); + sum_q = _mm256_sub_epi16(sum_q, p[7 - i]); + flat2_p[i] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4); + flat2_q[i] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4); + } + } + + // highbd_filter8 + p[2] = _mm256_andnot_si256(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm256_and_si256(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm256_or_si256(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm256_andnot_si256(flat, q[2]); + flat_q[2] = _mm256_and_si256(flat, flat_q[2]); + q[2] = _mm256_or_si256(q[2], flat_q[2]); // full list of q2 values + + int i; + for (i = 1; i >= 0; i--) { + ps[i] = _mm256_andnot_si256(flat, ps[i]); + flat_p[i] = _mm256_and_si256(flat, flat_p[i]); + p[i] = _mm256_or_si256(ps[i], flat_p[i]); + qs[i] = _mm256_andnot_si256(flat, qs[i]); + flat_q[i] = _mm256_and_si256(flat, flat_q[i]); + q[i] = _mm256_or_si256(qs[i], flat_q[i]); + } + + // highbd_filter16 + + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm256_andnot_si256(flat2, p[i]); + flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm256_or_si256(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm256_andnot_si256(flat2, q[i]); + flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]); + q[i] = _mm256_or_si256(q[i], flat2_q[i]); + _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]); + _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]); + } +} + +static INLINE void highbd_transpose16x16(uint16_t *src, int src_p, + uint16_t *dst, int dst_p) { + __m256i x[16]; + int i; + for (i = 0; i < 16; ++i) { + x[i] = _mm256_loadu_si256((const __m256i *)src); + src += src_p; + } + mm256_transpose_16x16(x, x); + for (i = 0; i < 16; ++i) { + _mm256_storeu_si256((__m256i *)dst, x[i]); + dst += dst_p; + } +} + +void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[256]); + + // Transpose 16x16 + highbd_transpose16x16(s - 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, 
blimit, limit, + thresh, bd); + + // Transpose back + highbd_transpose16x16(t_dst, 16, s - 8, p); +} + +static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0, + const uint8_t *t0, const uint8_t *b1, + const uint8_t *l1, const uint8_t *t1, int bd, + __m256i *blt, __m256i *lt, __m256i *thr) { + const __m128i z128 = _mm_setzero_si128(); + const __m128i blimit0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128); + const __m128i limit0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128); + const __m128i thresh0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128); + const __m128i blimit1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128); + const __m128i limit1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128); + const __m128i thresh1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128); + + *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1); + *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1); + *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1); + + int shift = bd - 8; + *blt = _mm256_slli_epi16(*blt, shift); + *lt = _mm256_slli_epi16(*lt, shift); + *thr = _mm256_slli_epi16(*thr, shift); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); + __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); + __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); + __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p)); + __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); + __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); + + const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); + const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); + + __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); + __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); + + __m256i blimit, limit, thresh; + get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit, &limit, &thresh); + + __m256i t80, tff80, tffe0, t1f, t7f; + if (bd == 8) { + t80 = _mm256_set1_epi16(0x80); + tff80 = _mm256_set1_epi16(0xff80); + tffe0 = _mm256_set1_epi16(0xffe0); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2); + tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2); + tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6); + } else { // bd == 12 + t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4); + tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4); + tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4); + } + + __m256i ps1 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80); + __m256i ps0 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80); + __m256i qs0 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * 
p)), t80); + __m256i qs1 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80); + + // filter_mask and hev_mask + const __m256i zero = _mm256_setzero_si256(); + __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); + __m256i hev = _mm256_subs_epu16(flat, thresh); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + __m256i mask = + _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + const __m256i one = _mm256_set1_epi16(1); + mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); + mask = _mm256_max_epi16(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + __m256i work = _mm256_max_epi16( + _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)), + _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3))); + mask = _mm256_max_epi16(work, mask); + work = _mm256_max_epi16( + _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)), + _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3))); + mask = _mm256_max_epi16(work, mask); + mask = _mm256_subs_epu16(mask, limit); + mask = _mm256_cmpeq_epi16(mask, zero); + + // filter4 + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filt = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, hev); + __m256i work_a = _mm256_subs_epi16(qs0, ps0); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + pixel_clamp(&pmin, &pmax, &filt); + + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm256_and_si256(filt, mask); + + const __m256i t4 = _mm256_set1_epi16(4); + const __m256i t3 = _mm256_set1_epi16(3); + + __m256i filter1 = _mm256_adds_epi16(filt, t4); + pixel_clamp(&pmin, &pmax, &filter1); + __m256i filter2 = _mm256_adds_epi16(filt, t3); + pixel_clamp(&pmin, &pmax, &filter2); + + // Filter1 >> 3 + work_a = _mm256_cmpgt_epi16(zero, filter1); // get the values that are <0 + filter1 = _mm256_srli_epi16(filter1, 3); + work_a = _mm256_and_si256(work_a, tffe0); // sign bits for the values < 0 + filter1 = _mm256_and_si256(filter1, t1f); // clamp the range + filter1 = _mm256_or_si256(filter1, work_a); // reinsert the sign bits + + // Filter2 >> 3 + work_a = _mm256_cmpgt_epi16(zero, filter2); + filter2 = _mm256_srli_epi16(filter2, 3); + work_a = _mm256_and_si256(work_a, tffe0); + filter2 = _mm256_and_si256(filter2, t1f); + filter2 = _mm256_or_si256(filter2, work_a); + + // filt >> 1 + // equivalent to shifting 0x1f left by bitdepth - 8 + // and setting new bits to 1 + filt = _mm256_adds_epi16(filter1, one); + work_a = _mm256_cmpgt_epi16(zero, filt); + filt = _mm256_srli_epi16(filt, 1); + work_a = _mm256_and_si256(work_a, tff80); + filt = _mm256_and_si256(filt, t7f); + filt = _mm256_or_si256(filt, work_a); + + filt = _mm256_andnot_si256(hev, filt); + + filter1 = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &filter1); + q0 = _mm256_adds_epi16(filter1, t80); + + filter1 = _mm256_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &filter1); + q1 = 
_mm256_adds_epi16(filter1, t80); + + filter2 = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &filter2); + p0 = _mm256_adds_epi16(filter2, t80); + + filter2 = _mm256_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &filter2); + p1 = _mm256_adds_epi16(filter2, t80); + + _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); + _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); + _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); + _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); + + __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); + __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); + __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); + __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); + __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); + __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p)); + + __m256i blimit, limit, thresh; + get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit, &limit, &thresh); + + __m256i t80; + if (bd == 8) { + t80 = _mm256_set1_epi16(0x80); + } else if (bd == 10) { + t80 = _mm256_set1_epi16(0x200); + } else { // bd == 12 + t80 = _mm256_set1_epi16(0x800); + } + + __m256i ps1, ps0, qs0, qs1; + ps1 = _mm256_subs_epi16(p1, t80); + ps0 = _mm256_subs_epi16(p0, t80); + qs0 = _mm256_subs_epi16(q0, t80); + qs1 = _mm256_subs_epi16(q1, t80); + + // filter_mask and hev_mask + __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); + abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); + + abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); + abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); + __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); + __m256i hev = _mm256_subs_epu16(flat, thresh); + const __m256i zero = _mm256_set1_epi16(0); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + __m256i mask = + _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + + const __m256i one = _mm256_set1_epi16(1); + mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); + mask = _mm256_max_epi16(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + mask = _mm256_max_epi16(abs_q1q0, mask); + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)), + _mm256_abs_epi16(_mm256_sub_epi16(q2, q1))); + mask = _mm256_max_epi16(work, mask); + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)), + _mm256_abs_epi16(_mm256_sub_epi16(q3, q2))); + mask = _mm256_max_epi16(work, mask); + mask = _mm256_subs_epu16(mask, 
limit); + mask = _mm256_cmpeq_epi16(mask, zero); + + // flat_mask4 + flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)), + _mm256_abs_epi16(_mm256_sub_epi16(q2, q0))); + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)), + _mm256_abs_epi16(_mm256_sub_epi16(q3, q0))); + flat = _mm256_max_epi16(work, flat); + flat = _mm256_max_epi16(abs_p1p0, flat); + flat = _mm256_max_epi16(abs_q1q0, flat); + + if (bd == 8) + flat = _mm256_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4)); + + flat = _mm256_cmpeq_epi16(flat, zero); + flat = _mm256_and_si256(flat, mask); // flat & mask + + // Added before shift for rounding part of ROUND_POWER_OF_TWO + __m256i workp_a, workp_b, workp_shft; + workp_a = + _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1)); + const __m256i four = _mm256_set1_epi16(4); + workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0); + workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft); + + workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft); + + // lp filter + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filt, filter1, filter2, work_a; + filt = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, hev); + work_a = _mm256_subs_epi16(qs0, ps0); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, mask); + + const __m256i t4 = _mm256_set1_epi16(4); + const __m256i t3 = _mm256_set1_epi16(3); + + filter1 = _mm256_adds_epi16(filt, t4); + filter2 = _mm256_adds_epi16(filt, t3); + + // Filter1 >> 3 + pixel_clamp(&pmin, &pmax, &filter1); + filter1 = _mm256_srai_epi16(filter1, 3); + + // Filter2 >> 3 + pixel_clamp(&pmin, &pmax, &filter2); + filter2 = _mm256_srai_epi16(filter2, 3); + + // filt >> 1 + filt = _mm256_adds_epi16(filter1, one); + filt = 
_mm256_srai_epi16(filt, 1); + // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + filt = _mm256_andnot_si256(hev, filt); + + work_a = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + q0 = _mm256_loadu_si256((__m256i *)flat_oq0); + work_a = _mm256_andnot_si256(flat, work_a); + q0 = _mm256_and_si256(flat, q0); + q0 = _mm256_or_si256(work_a, q0); + + work_a = _mm256_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + q1 = _mm256_loadu_si256((__m256i *)flat_oq1); + work_a = _mm256_andnot_si256(flat, work_a); + q1 = _mm256_and_si256(flat, q1); + q1 = _mm256_or_si256(work_a, q1); + + work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + q2 = _mm256_loadu_si256((__m256i *)flat_oq2); + work_a = _mm256_andnot_si256(flat, work_a); + q2 = _mm256_and_si256(flat, q2); + q2 = _mm256_or_si256(work_a, q2); + + work_a = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + p0 = _mm256_loadu_si256((__m256i *)flat_op0); + work_a = _mm256_andnot_si256(flat, work_a); + p0 = _mm256_and_si256(flat, p0); + p0 = _mm256_or_si256(work_a, p0); + + work_a = _mm256_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + p1 = _mm256_loadu_si256((__m256i *)flat_op1); + work_a = _mm256_andnot_si256(flat, work_a); + p1 = _mm256_and_si256(flat, p1); + p1 = _mm256_or_si256(work_a, p1); + + work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + p2 = _mm256_loadu_si256((__m256i *)flat_op2); + work_a = _mm256_andnot_si256(flat, work_a); + p2 = _mm256_and_si256(flat, p2); + p2 = _mm256_or_si256(work_a, p2); + + _mm256_storeu_si256((__m256i *)(s - 3 * p), p2); + _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); + _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); + _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); + _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); + _mm256_storeu_si256((__m256i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} +#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c 
b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 76369871b..0a399edf2 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -12,135 +12,135 @@ #include <emmintrin.h> // SSE2 #include "./aom_dsp_rtcd.h" -#include "aom_ports/mem.h" +#include "aom_dsp/x86/lpf_common_sse2.h" #include "aom_ports/emmintrin_compat.h" +#include "aom_ports/mem.h" -static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { - __m128i ubounded; - __m128i lbounded; - __m128i retval; +static INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + __m128i clamped, mask; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - __m128i t80, max, min; + mask = _mm_cmpgt_epi16(*pixel, *max); + clamped = _mm_andnot_si128(mask, *pixel); + mask = _mm_and_si128(mask, *max); + clamped = _mm_or_si128(mask, clamped); - if (bd == 8) { - t80 = _mm_set1_epi16(0x80); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); - } else if (bd == 10) { - t80 = _mm_set1_epi16(0x200); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); - } else { // bd == 12 - t80 = _mm_set1_epi16(0x800); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); - } + mask = _mm_cmpgt_epi16(clamped, *min); + clamped = _mm_and_si128(mask, clamped); + mask = _mm_andnot_si128(mask, *min); + *pixel = _mm_or_si128(clamped, mask); +} - min = _mm_subs_epi16(zero, t80); +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); - ubounded = _mm_cmpgt_epi16(value, max); - lbounded = _mm_cmplt_epi16(value, min); - retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value); - ubounded = _mm_and_si128(ubounded, max); - lbounded = _mm_and_si128(lbounded, min); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_or_si128(retval, lbounded); - return retval; -} + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); -// TODO(debargha, peter): Break up large functions into smaller ones -// in this file. 
-void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; - __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; - __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; - __m128i ps1, qs1, ps0, qs0; - __m128i abs_p0q0, abs_p1q1, ffff, work; - __m128i filt, work_a, filter1, filter2; - __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; - __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; - __m128i flat2_q0, flat2_p0; - __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3; - __m128i t4, t3, t80, t1; - __m128i eight, four; + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); - if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); - } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); - } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); } +} +// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); +static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q, + const __m128i *t, __m128i *hev) { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1])); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1])); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *t); - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - - // highbd_filter_mask - abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); - abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i zero = _mm_setzero_si128(); + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); +} - ffff = 
_mm_cmpeq_epi16(abs_p1p0, abs_p1p0); +static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0])); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1])); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]), + _mm_subs_epu16(p[i - 1], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]), + _mm_subs_epu16(q[i - 1], q[i]))); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} - // highbd_hev_mask (in C code this is actually called from highbd_filter4) - flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p, + const __m128i *q, int bd, int start, + int end, __m128i *flat) { + __m128i max = _mm_setzero_si128(); + int i; + for (i = start; i < end; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]), + _mm_subs_epu16(p[0], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]), + _mm_subs_epu16(q[0], q[i]))); + } - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), - _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); - mask = _mm_max_epi16(work, mask); + __m128i ft; + if (bd == 8) + ft = _mm_subs_epu16(max, *th); + else if (bd == 10) + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2)); + else // bd == 12 + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4)); - mask = _mm_subs_epu16(mask, limit); - mask = _mm_cmpeq_epi16(mask, zero); // return ~mask + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} - // lp filter - // highbd_filter4 - t4 = _mm_set1_epi16(4); - t3 = _mm_set1_epi16(3); +// Note: +// Access p[3-1], p[0], and q[3-1], q[0] +static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + // check the distance 1,2,3 against 0 + flat_mask_internal(th, p, q, bd, 1, 4, flat); +} + +// Note: +// access p[7-4], 
p[0], and q[7-4], q[0] +static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + flat_mask_internal(th, p, q, bd, 4, 8, flat); +} + +static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask, + const __m128i *th, int bd, __m128i *ps, + __m128i *qs) { + __m128i t80; if (bd == 8) t80 = _mm_set1_epi16(0x80); else if (bd == 10) @@ -148,340 +148,283 @@ void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, else // bd == 12 t80 = _mm_set1_epi16(0x800); - t1 = _mm_set1_epi16(0x1); + __m128i ps0 = _mm_subs_epi16(p[0], t80); + __m128i ps1 = _mm_subs_epi16(p[1], t80); + __m128i qs0 = _mm_subs_epi16(q[0], t80); + __m128i qs1 = _mm_subs_epi16(q[1], t80); - ps1 = _mm_subs_epi16(p1, t80); - qs1 = _mm_subs_epi16(q1, t80); - ps0 = _mm_subs_epi16(p0, t80); - qs0 = _mm_subs_epi16(q0, t80); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, t80); - filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), - hev); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); - filt = _mm_and_si128(filt, mask); - filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); - filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); - // Filter1 >> 3 - filter1 = _mm_srai_epi16(filter1, 0x3); - filter2 = _mm_srai_epi16(filter2, 0x3); + __m128i hev; + highbd_hev_mask(p, q, th, &hev); + filter = _mm_and_si128(filter, hev); - qs0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - ps0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(hev, filt); - qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); - // end highbd_filter4 - // loopfilter done + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); - // highbd_flat_mask4 - flat = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); - flat = _mm_max_epi16(work, flat); - work = _mm_max_epi16(abs_p1p0, abs_q1q0); - flat = _mm_max_epi16(work, flat); + __m128i filter1 = _mm_adds_epi16(filter, t4); + __m128i filter2 = _mm_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); - if (bd == 8) - flat = _mm_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm_subs_epu16(flat, 
_mm_slli_epi16(one, 4)); + qs0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); - flat = _mm_cmpeq_epi16(flat, zero); - // end flat_mask4 + qs[0] = _mm_adds_epi16(qs0, t80); + ps[0] = _mm_adds_epi16(ps0, t80); - // flat & mask = flat && mask (as used in filter8) - // (because, in both vars, each block of 16 either all 1s or all 0s) - flat = _mm_and_si128(flat, mask); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + qs1 = _mm_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); - // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 - // but referred to as p0-p4 & q0-q4 in fn) - flat2 = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), - _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + qs[1] = _mm_adds_epi16(qs1, t80); + ps[1] = _mm_adds_epi16(ps1, t80); +} - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), - _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); - flat2 = _mm_max_epi16(work, flat2); +typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput; - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), - _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); - flat2 = _mm_max_epi16(work, flat2); +static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd, + PixelOutput pixel_output) { + __m128i blimit, limit, thresh; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), - _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); - flat2 = _mm_max_epi16(work, flat2); + __m128i p[8], q[8]; + load_highbd_pixel(s, 8, pitch, p, q); - if (bd == 8) - flat2 = _mm_subs_epu16(flat2, one); - else if (bd == 10) - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + __m128i mask; + highbd_filter_mask(p, q, &limit, &blimit, &mask); + + __m128i flat, flat2; + const __m128i one = _mm_set1_epi16(1); + highbd_flat_mask4(&one, p, q, &flat, bd); + highbd_flat_mask5(&one, p, q, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); - flat2 = _mm_cmpeq_epi16(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - // end highbd_flat_mask5 + __m128i ps[2], qs[2]; + highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations - eight = _mm_set1_epi16(8); - four = _mm_set1_epi16(4); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); - pixelFilter_q = 
_mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - flat2_p0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); - flat2_q0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); - flat_p0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); - flat_q0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); - - sum_p7 = _mm_add_epi16(p7, p7); - sum_q7 = _mm_add_epi16(q7, q7); - sum_p3 = _mm_add_epi16(p3, p3); - sum_q3 = _mm_add_epi16(q3, q3); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); - flat2_p1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); - flat2_q1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); - flat_p1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); - flat_q1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - sum_p3 = _mm_add_epi16(sum_p3, p3); - sum_q3 = _mm_add_epi16(sum_q3, q3); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); - flat2_p2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); - flat2_q2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); - flat_p2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); - flat_q2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); - flat2_p3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); - flat2_q3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); - flat2_p4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); - flat2_q4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); - flat2_p5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); - flat2_q5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); - flat2_p6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); - flat2_q6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, 
_mm_add_epi16(sum_q7, q6)), 4); - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // highbd_filter8 - p2 = _mm_andnot_si128(flat, p2); + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[7], flat2_q[7]; + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + + __m128i sum_p = + _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3])); + __m128i sum_q = + _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3])); + + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat2_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4); + flat2_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4); + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); + + __m128i sum_p7 = _mm_add_epi16(p[7], p[7]); + __m128i sum_q7 = _mm_add_epi16(q[7], q[7]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); + + sum_q = _mm_sub_epi16(sum_p, p[6]); + sum_p = _mm_sub_epi16(sum_p, q[6]); + flat2_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4); + flat2_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); + + sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_q, p[5]); + flat2_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4); + flat2_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); + + int i; + for (i = 3; i < 7; ++i) { + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p = _mm_sub_epi16(sum_p, q[7 - i]); + sum_q = _mm_sub_epi16(sum_q, p[7 - i]); + flat2_p[i] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4); + flat2_q[i] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4); + } + } + + // highbd_filter8 + p[2] = _mm_andnot_si128(flat, p[2]); // p2 remains unchanged if !(flat && mask) - flat_p2 = _mm_and_si128(flat, flat_p2); + flat_p[2] = _mm_and_si128(flat, flat_p[2]); // when (flat && mask) - p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values - - ps1 = _mm_andnot_si128(flat, ps1); - // p1 takes the value assigned to in in filter4 if !(flat && mask) - flat_p1 = _mm_and_si128(flat, flat_p1); - // when (flat && mask) - p1 = _mm_or_si128(ps1, flat_p1); // 
full list of p1 values - qs1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values - - ps0 = _mm_andnot_si128(flat, ps0); - // p0 takes the value assigned to in in filter4 if !(flat && mask) - flat_p0 = _mm_and_si128(flat, flat_p0); - // when (flat && mask) - p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values - qs0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values - // end highbd_filter8 + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + int i; + for (i = 1; i >= 0; i--) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } // highbd_filter16 - p6 = _mm_andnot_si128(flat2, p6); - // p6 remains unchanged if !(flat2 && flat && mask) - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - // get values for when (flat2 && flat && mask) - p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values - q6 = _mm_andnot_si128(flat2, q6); - // q6 remains unchanged if !(flat2 && flat && mask) - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - // get values for when (flat2 && flat && mask) - q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); - - p5 = _mm_andnot_si128(flat2, p5); - // p5 remains unchanged if !(flat2 && flat && mask) - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - // get values for when (flat2 && flat && mask) - p5 = _mm_or_si128(p5, flat2_p5); - // full list of p5 values - q5 = _mm_andnot_si128(flat2, q5); - // q5 remains unchanged if !(flat2 && flat && mask) - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - // get values for when (flat2 && flat && mask) - q5 = _mm_or_si128(q5, flat2_q5); - // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); - - p4 = _mm_andnot_si128(flat2, p4); - // p4 remains unchanged if !(flat2 && flat && mask) - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - // get values for when (flat2 && flat && mask) - p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values - q4 = _mm_andnot_si128(flat2, q4); - // q4 remains unchanged if !(flat2 && flat && mask) - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - // get values for when (flat2 && flat && mask) - q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); - - p3 = _mm_andnot_si128(flat2, p3); - // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - // get values for when (flat2 && flat && mask) - p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values - q3 = _mm_andnot_si128(flat2, q3); - // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - // get values for when (flat2 && flat && mask) - q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); - - p2 = _mm_andnot_si128(flat2, p2); - // p2 takes value from highbd_filter8 if 
!(flat2 && flat && mask) - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - // get values for when (flat2 && flat && mask) - p2 = _mm_or_si128(p2, flat2_p2); - // full list of p2 values - q2 = _mm_andnot_si128(flat2, q2); - // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - // get values for when (flat2 && flat && mask) - q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); - - p1 = _mm_andnot_si128(flat2, p1); - // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - // get values for when (flat2 && flat && mask) - p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values - q1 = _mm_andnot_si128(flat2, q1); - // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - // get values for when (flat2 && flat && mask) - q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - - p0 = _mm_andnot_si128(flat2, p0); - // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - // get values for when (flat2 && flat && mask) - p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values - q0 = _mm_andnot_si128(flat2, q0); - // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - // get values for when (flat2 && flat && mask) - q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); + + if (pixel_output == FOUR_PIXELS) { + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]); + } + } else { // EIGHT_PIXELS + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + } + } +} + +// Note: +// highbd_lpf_horz_edge_8_8p() output 8 pixels per register +// highbd_lpf_horz_edge_8_4p() output 4 pixels per register +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS); +} +#endif // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + +static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + 
highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS); +} + +void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); +#endif } void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { - aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); - aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); + highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd); +#endif +} + +static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1, + const __m128i *p0, const __m128i *q0, + const __m128i *q1, const __m128i *q2, + int p, uint16_t *s) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 3 * p), *p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), *p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), *p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), *q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), *q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), *q2); +#else + _mm_store_si128((__m128i *)(s - 3 * p), *p2); + _mm_store_si128((__m128i *)(s - 2 * p), *p1); + _mm_store_si128((__m128i *)(s - 1 * p), *p0); + _mm_store_si128((__m128i *)(s + 0 * p), *q0); + _mm_store_si128((__m128i *)(s + 1 * p), *q1); + _mm_store_si128((__m128i *)(s + 2 * p), *q2); +#endif } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, @@ -497,14 +440,14 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -635,41 +578,48 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); // lp filter - filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i pmin = _mm_subs_epi16(zero, t80); + + filt = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, hev); 
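  // A scalar sketch of what the statements below compute (the usual highbd
  // filter4 arithmetic, with pixel_clamp() now providing the clamps):
  //   filter  = clamp(filter + 3 * (qs0 - ps0)) & mask
  //   filter1 = clamp(filter + 4) >> 3
  //   filter2 = clamp(filter + 3) >> 3
  //   filter  = ((filter1 + 1) >> 1) & ~hev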
work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = signed_char_clamp_bd_sse2(filt, bd); + pixel_clamp(&pmin, &pmax, &filt); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi16(filt, t4); filter2 = _mm_adds_epi16(filt, t3); // Filter1 >> 3 - filter1 = signed_char_clamp_bd_sse2(filter1, bd); + pixel_clamp(&pmin, &pmax, &filter1); filter1 = _mm_srai_epi16(filter1, 3); // Filter2 >> 3 - filter2 = signed_char_clamp_bd_sse2(filter2, bd); + pixel_clamp(&pmin, &pmax, &filter2); filter2 = _mm_srai_epi16(filter2, 3); // filt >> 1 filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); - // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; filt = _mm_andnot_si128(hev, filt); - work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd); + work_a = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); q0 = _mm_load_si128((__m128i *)flat_oq0); work_a = _mm_andnot_si128(flat, work_a); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); - work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd); + work_a = _mm_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); q1 = _mm_load_si128((__m128i *)flat_oq1); work_a = _mm_andnot_si128(flat, work_a); @@ -682,14 +632,16 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); - work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd); + work_a = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); p0 = _mm_load_si128((__m128i *)flat_op0); work_a = _mm_andnot_si128(flat, work_a); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); - work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd); + work_a = _mm_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); p1 = _mm_load_si128((__m128i *)flat_op1); work_a = _mm_andnot_si128(flat, work_a); @@ -702,12 +654,7 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s); } void aom_highbd_lpf_horizontal_8_dual_sse2( @@ -725,14 +672,18 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); +#endif __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); +#endif const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -743,7 +694,7 
@@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); - __m128i work; + const __m128i t4 = _mm_set1_epi16(4); const __m128i t3 = _mm_set1_epi16(3); __m128i t80; @@ -814,9 +765,9 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, // So taking maximums continues to work: mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); mask = _mm_max_epi16(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16( + +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) + __m128i work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); mask = _mm_max_epi16(work, mask); @@ -824,22 +775,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); +#endif mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); // filter4 - filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i pmin = _mm_subs_epi16(zero, t80); + + filt = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); filt = _mm_and_si128(filt, hev); work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); - filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + filt = _mm_adds_epi16(filt, work_a); + pixel_clamp(&pmin, &pmax, &filt); // (aom_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); - filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); - filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + filter1 = _mm_adds_epi16(filt, t4); + pixel_clamp(&pmin, &pmax, &filter1); + + filter2 = _mm_adds_epi16(filt, t3); + pixel_clamp(&pmin, &pmax, &filter2); // Filter1 >> 3 work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 @@ -865,19 +826,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, filt = _mm_andnot_si128(hev, filt); - q0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - p0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); - + q0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &q0); + q0 = _mm_adds_epi16(q0, t80); + + q1 = _mm_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &q1); + q1 = _mm_adds_epi16(q1, t80); + + p0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &p0); + p0 = _mm_adds_epi16(p0, t80); + + p1 = _mm_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &p1); + p1 = _mm_adds_epi16(p1, t80); +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); +#else _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s + 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +#endif 
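  // Note: with CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 only the low 64
  // bits of each row (4 pixels) are written back, presumably to match the
  // narrower 4-pixel filtering unit; otherwise the full 8-pixel rows are
  // stored as before.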
} void aom_highbd_lpf_horizontal_4_dual_sse2( @@ -888,118 +862,6 @@ void aom_highbd_lpf_horizontal_4_dual_sse2( aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } -static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], - int out_p, int num_8x8_to_transpose) { - int idx8x8 = 0; - __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; - do { - uint16_t *in = src[idx8x8]; - uint16_t *out = dst[idx8x8]; - - p0 = - _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 - p1 = - _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 - p2 = - _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 - p3 = - _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 - p4 = - _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 - p5 = - _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 - p6 = - _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 - p7 = - _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 - // 00 10 01 11 02 12 03 13 - x0 = _mm_unpacklo_epi16(p0, p1); - // 20 30 21 31 22 32 23 33 - x1 = _mm_unpacklo_epi16(p2, p3); - // 40 50 41 51 42 52 43 53 - x2 = _mm_unpacklo_epi16(p4, p5); - // 60 70 61 71 62 72 63 73 - x3 = _mm_unpacklo_epi16(p6, p7); - // 00 10 20 30 01 11 21 31 - x4 = _mm_unpacklo_epi32(x0, x1); - // 40 50 60 70 41 51 61 71 - x5 = _mm_unpacklo_epi32(x2, x3); - // 00 10 20 30 40 50 60 70 - x6 = _mm_unpacklo_epi64(x4, x5); - // 01 11 21 31 41 51 61 71 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); - // 00 10 20 30 40 50 60 70 - _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); - // 01 11 21 31 41 51 61 71 - - // 02 12 22 32 03 13 23 33 - x4 = _mm_unpackhi_epi32(x0, x1); - // 42 52 62 72 43 53 63 73 - x5 = _mm_unpackhi_epi32(x2, x3); - // 02 12 22 32 42 52 62 72 - x6 = _mm_unpacklo_epi64(x4, x5); - // 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); - // 02 12 22 32 42 52 62 72 - _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); - // 03 13 23 33 43 53 63 73 - - // 04 14 05 15 06 16 07 17 - x0 = _mm_unpackhi_epi16(p0, p1); - // 24 34 25 35 26 36 27 37 - x1 = _mm_unpackhi_epi16(p2, p3); - // 44 54 45 55 46 56 47 57 - x2 = _mm_unpackhi_epi16(p4, p5); - // 64 74 65 75 66 76 67 77 - x3 = _mm_unpackhi_epi16(p6, p7); - // 04 14 24 34 05 15 25 35 - x4 = _mm_unpacklo_epi32(x0, x1); - // 44 54 64 74 45 55 65 75 - x5 = _mm_unpacklo_epi32(x2, x3); - // 04 14 24 34 44 54 64 74 - x6 = _mm_unpacklo_epi64(x4, x5); - // 05 15 25 35 45 55 65 75 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); - // 04 14 24 34 44 54 64 74 - _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); - // 05 15 25 35 45 55 65 75 - - // 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi32(x0, x1); - // 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi32(x2, x3); - // 06 16 26 36 46 56 66 76 - x6 = _mm_unpacklo_epi64(x4, x5); - // 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); - // 06 16 26 36 46 56 66 76 - _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); - // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} - -static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, - uint16_t *out, int out_p) { - uint16_t *src0[1]; - uint16_t *src1[1]; - uint16_t 
*dest0[1]; - uint16_t *dest1[1]; - src0[0] = in0; - src1[0] = in1; - dest0[0] = out; - dest1[0] = out + 8; - highbd_transpose(src0, in_p, dest0, out_p, 1); - highbd_transpose(src1, in_p, dest1, out_p, 1); -} - void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { @@ -1130,10 +992,12 @@ void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - // Loop filtering +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); +#else aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); - +#endif // Transpose back highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm index 9c3bbdd69..855bc6558 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -293,4 +293,6 @@ HIGH_SADNXN4D 4, 16 HIGH_SADNXN4D 16, 4 HIGH_SADNXN4D 8, 32 HIGH_SADNXN4D 32, 8 +HIGH_SADNXN4D 16, 64 +HIGH_SADNXN4D 64, 16 %endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm index 248b98ef5..760e68aab 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -158,7 +158,10 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 - +%if CONFIG_EXT_PARTITION_TYPES +HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 +HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 +%endif ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -302,6 +305,8 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 %if CONFIG_EXT_PARTITION_TYPES HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 +HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 +HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 %endif ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c index 7bc8a0df3..befd81269 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -177,177 +177,94 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); } -static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 3; - src += src_stride << 3; - pred += pred_stride << 3; - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 8; - src += 8; - pred += 8; - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void 
subtract_16x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 3; - src += src_stride << 3; - pred += pred_stride << 3; - subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 4; - src += src_stride << 4; - pred += pred_stride << 4; - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 16; - src += 16; - pred += 16; - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 4; - src += src_stride << 4; - pred += pred_stride << 4; - subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 5; - src += src_stride << 5; - pred += pred_stride << 5; - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 32; - src += 32; - pred += 32; - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 5; - src += src_stride << 5; - pred += pred_stride << 5; - subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 6; - src += src_stride << 6; - pred += pred_stride << 6; - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 64; - src += 64; - pred += 64; - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t 
pred_stride) { - subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 6; - src += src_stride << 6; - pred += pred_stride << 6; - subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} +#define STACK_V(h, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ + pred + pred_stride * h, pred_stride); \ + } while (0) + +#define STACK_H(w, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ + } while (0) + +#define SUBTRACT_FUN(size) \ + static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ + const uint16_t *src, ptrdiff_t src_stride, \ + const uint16_t *pred, ptrdiff_t pred_stride) + +SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } +SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } +SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } +SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } +SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } +SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } +SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } +SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } +SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } +#if CONFIG_EXT_PARTITION +SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } +SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } +SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } +#endif +SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } +SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } +SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } +SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } +SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } +SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } +#if CONFIG_EXT_PARTITION +SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); } +SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); } +#endif static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { - SubtractWxHFuncType ret_func_ptr = NULL; if (rows == 4) { - if (cols == 4) { - ret_func_ptr = subtract_4x4; - } else if (cols == 8) { - ret_func_ptr = subtract_8x4; - } - } else if (rows == 8) { - if (cols == 4) { - ret_func_ptr = subtract_4x8; - } else if (cols == 8) { - ret_func_ptr = subtract_8x8; - } else if (cols == 16) { - ret_func_ptr = subtract_16x8; - } - } else if (rows == 16) { - if (cols == 8) { - ret_func_ptr = subtract_8x16; - } else if (cols == 16) { - ret_func_ptr = subtract_16x16; - } else if (cols == 32) { - ret_func_ptr = subtract_32x16; - } - } else if (rows == 32) { - if (cols == 16) { - ret_func_ptr = subtract_16x32; - } else if (cols == 32) { - ret_func_ptr = subtract_32x32; - } else if (cols == 64) { - ret_func_ptr = subtract_64x32; - } - } else if (rows == 64) { - if (cols == 32) { - ret_func_ptr = subtract_32x64; - } else if (cols == 64) { - ret_func_ptr = subtract_64x64; - } else if (cols == 128) { - ret_func_ptr = subtract_128x64; - } - } else if (rows == 128) { - if (cols == 64) { - ret_func_ptr = subtract_64x128; - } else if (cols == 128) { - ret_func_ptr = subtract_128x128; - } + if (cols == 4) return subtract_4x4; + if (cols == 8) return subtract_8x4; + if (cols == 16) return subtract_16x4; + } + if (rows == 8) { + if (cols == 4) return subtract_4x8; + if (cols == 8) return subtract_8x8; + if (cols == 16) return subtract_16x8; + if (cols == 32) return subtract_32x8; + } + if (rows == 16) { + if (cols == 4) return 
subtract_4x16; + if (cols == 8) return subtract_8x16; + if (cols == 16) return subtract_16x16; + if (cols == 32) return subtract_32x16; + if (cols == 64) return subtract_64x16; + } + if (rows == 32) { + if (cols == 8) return subtract_8x32; + if (cols == 16) return subtract_16x32; + if (cols == 32) return subtract_32x32; + if (cols == 64) return subtract_64x32; +#if CONFIG_EXT_PARTITION + if (cols == 128) return subtract_128x32; +#endif // CONFIG_EXT_PARTITION + } + if (rows == 64) { + if (cols == 16) return subtract_16x64; + if (cols == 32) return subtract_32x64; + if (cols == 64) return subtract_64x64; +#if CONFIG_EXT_PARTITION + if (cols == 128) return subtract_128x64; +#endif // CONFIG_EXT_PARTITION } - if (!ret_func_ptr) { - assert(0); +#if CONFIG_EXT_PARTITION + if (rows == 128) { + if (cols == 32) return subtract_32x128; + if (cols == 64) return subtract_64x128; + if (cols == 128) return subtract_128x128; } - return ret_func_ptr; +#endif // CONFIG_EXT_PARTITION + assert(0); + return NULL; } void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 93923ffb0..62acf3ed3 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -189,6 +189,8 @@ VAR_FN(8, 8, 8, 6); VAR_FN(16, 4, 16, 6); VAR_FN(8, 32, 8, 8); VAR_FN(32, 8, 16, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); #endif #undef VAR_FN @@ -411,7 +413,9 @@ DECLS(sse2); FN(8, 4, 8, 3, 2, opt, (int64_t)); \ FN(16, 4, 16, 4, 2, opt, (int64_t)); \ FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)) + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ @@ -588,7 +592,9 @@ DECLS(sse2); FN(8, 4, 8, 3, 2, opt, (int64_t)); \ FN(16, 4, 16, 4, 2, opt, (int64_t)); \ FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)); #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c new file mode 100644 index 000000000..6b8922b8c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "./aom_dsp_rtcd.h" + +static INLINE __m256i dc_sum_32(const uint8_t *ref) { + const __m256i x = _mm256_loadu_si256((const __m256i *)ref); + const __m256i zero = _mm256_setzero_si256(); + __m256i y = _mm256_sad_epu8(x, zero); + __m256i u = _mm256_permute2x128_si256(y, y, 1); + y = _mm256_add_epi64(u, y); + u = _mm256_unpackhi_epi64(y, y); + return _mm256_add_epi16(y, u); +} + +static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + dst += stride; + } +} + +void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum_left = _mm256_add_epi16(sum_left, thirtytwo); + sum_left = _mm256_srai_epi16(sum_left, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum_left, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 32, dst, stride); +} + +// There are 32 rows togeter. This function does line: +// 0,1,2,3, and 16,17,18,19. The next call would do +// 4,5,6,7, and 20,21,22,23. So 4 times of calling +// would finish 32 rows. 
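// A sketch of the resulting call pattern (see aom_h_predictor_32x32_avx2()
// below; v0..v3 stand for the four shuffled left-column vectors):
//   h_predictor_32x8line(&v0, dst + 0 * stride, stride);   // rows 0-3, 16-19
//   h_predictor_32x8line(&v1, dst + 4 * stride, stride);   // rows 4-7, 20-23
//   h_predictor_32x8line(&v2, dst + 8 * stride, stride);   // rows 8-11, 24-27
//   h_predictor_32x8line(&v3, dst + 12 * stride, stride);  // rows 12-15, 28-31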
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, + ptrdiff_t stride) { + __m256i t[4]; + __m256i m = _mm256_setzero_si256(); + const __m256i inc = _mm256_set1_epi8(4); + int i; + + for (i = 0; i < 4; i++) { + t[i] = _mm256_shuffle_epi8(*row, m); + __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); + __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); + _mm256_storeu_si256((__m256i *)dst, r0); + _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); + dst += stride; + m = _mm256_add_epi8(m, inc); + } +} + +void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); + + __m256i u = _mm256_unpacklo_epi8(left_col, left_col); + + __m256i v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + u = _mm256_unpackhi_epi8(left_col, left_col); + + v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); +} + +// ----------------------------------------------------------------------------- +// Rectangle + +// TODO(luoyi) The following two functions are shared with intrapred_sse2.c. +// Use a header file, intrapred_common_x86.h +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i top_sum = dc_sum_32_sse2(above); + __m128i left_sum = dc_sum_16_sse2(left); + left_sum = _mm_add_epi16(top_sum, left_sum); + uint32_t sum = _mm_cvtsi128_si32(left_sum); + sum += 24; + sum /= 48; + + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, 
ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// TM_PRED + +// Return 16 16-bit pixels in one row (__m256i) +static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i base = + _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); + + __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); + __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); + __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); + + __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); + mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); + __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); + + pl = _mm256_andnot_si256(mask1, *left); + + ptl = _mm256_and_si256(mask2, *topleft); + pt = _mm256_andnot_si256(mask2, *top); + pt = _mm256_or_si256(pt, ptl); + pt = _mm256_and_si256(mask1, pt); + + return _mm256_or_si256(pt, pl); +} + +// Return 16 8-bit pixels in one row (__m128i) +static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i p0 = paeth_pred(left, top, topleft); + const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i p = _mm256_packus_epi16(p0, p1); + return _mm256_castsi256_si128(p); +} + +static INLINE __m256i get_top_vector(const uint8_t *above) { + const __m128i x = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t0 = _mm_unpacklo_epi8(x, zero); + const __m128i t1 = _mm_unpackhi_epi8(x, zero); + return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); +} + +void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i x = _mm_loadl_epi64((const __m128i *)left); + const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 8; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +static INLINE __m256i get_left_vector(const uint8_t *left) { + const __m128i x = _mm_load_si128((const __m128i *)left); + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void 
aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +// Return 32 8-bit pixels in one row (__m256i) +static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, + const __m256i *top1, + const __m256i *topleft) { + __m256i p0 = paeth_pred(left, top0, topleft); + __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x0 = _mm256_packus_epi16(p0, p1); + + p0 = paeth_pred(left, top1, topleft); + p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x1 = _mm256_packus_epi16(p0, p1); + + return _mm256_permute2x128_si256(x0, x1, 0x20); +} + +void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); + + _mm256_storeu_si256((__m256i *)dst, r); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm index 02567db49..9aece27be 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm @@ -623,149 +623,3 @@ cglobal h_predictor_32x32, 2, 5, 3, dst, stride, 
line, left lea dstq, [dstq+strideq*4] jnz .loop REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x - punpcklbw m0, m1 - pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] - psrldq m0, 2 - psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] - movd m2, [leftq] - punpcklbw m2, m1 - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - pshuflw m4, m2, 0xaa - pshuflw m3, m2, 0xff - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movq m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] - pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] - DEFINE_ARGS dst, stride, line, left - mov lineq, -4 - punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] - psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] - movq m2, [leftq] - punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] -.loop: - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] - punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m3 - movq [dstq ], m4 - movhps [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m2, 4 - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left - pxor m1, m1 - mova m2, [aboveq-16]; - mova m0, [aboveq] ; t1 t2 ... t16 [byte] - punpckhbw m2, m1 ; [127:112] tl [word] - punpckhbw m4, m0, m1 - punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] - DEFINE_ARGS dst, stride, line, left, stride8 - mov lineq, -8 - pshufhw m2, m2, 0xff - mova m3, [leftq] ; l1 l2 ... l16 [byte] - punpckhqdq m2, m2 ; tl repeated 8 times [word] - psubw m0, m2 - psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] - punpckhbw m5, m3, m1 - punpcklbw m3, m1 ; m3:m5 l1 l2 ... 
l16 [word] - lea stride8q, [strideq*8] -.loop: - pshuflw m6, m3, 0x0 - pshuflw m7, m5, 0x0 - punpcklqdq m6, m6 ; l1 repeated 8 times [word] - punpcklqdq m7, m7 ; l8 repeated 8 times [word] - paddw m1, m6, m0 - paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] - psrldq m5, 2 - packuswb m1, m6 - mova [dstq ], m1 - paddw m1, m7, m0 - paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] - psrldq m3, 2 - packuswb m1, m7 - mova [dstq+stride8q], m1 - inc lineq - lea dstq, [dstq+strideq] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - mova m4, [aboveq+16] - punpcklbw m2, m1 - punpckhbw m3, m0, m1 - punpckhbw m5, m4, m1 - punpcklbw m0, m1 - punpcklbw m4, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - punpcklqdq m2, m2 - add leftq, 32 - psubw m0, m2 - psubw m3, m2 - psubw m4, m2 - psubw m5, m2 -.loop: - movd m2, [leftq+lineq*2] - pxor m1, m1 - punpcklbw m2, m1 - pshuflw m7, m2, 0x55 - pshuflw m2, m2, 0x0 - punpcklqdq m2, m2 - punpcklqdq m7, m7 - paddw m6, m2, m3 - paddw m1, m2, m0 - packuswb m1, m6 - mova [dstq ], m1 - paddw m6, m2, m5 - paddw m1, m2, m4 - packuswb m1, m6 - mova [dstq+16 ], m1 - paddw m6, m7, m3 - paddw m1, m7, m0 - packuswb m1, m6 - mova [dstq+strideq ], m1 - paddw m6, m7, m5 - paddw m1, m7, m4 - packuswb m1, m6 - mova [dstq+strideq+16], m1 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c new file mode 100644 index 000000000..2a83b9001 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "./aom_dsp_rtcd.h" + +static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) { + int i; + for (i = 0; i < 4; ++i) { + *(uint32_t *)dst = dc; + dst += stride; + *(uint32_t *)dst = dc; + dst += stride; + } +} + +static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_storel_epi64((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + dst += stride; + } +} + +static INLINE __m128i dc_sum_4(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_unpacklo_epi8(x, zero); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_8(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_16(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum /= 12; + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum /= 12; + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum /= 24; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = 
dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum /= 24; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum /= 48; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum /= 48; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above 
= _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + 
sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4x8(pred, dst, stride); +} + +void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + (void)left; + int i; + for (i = 0; i < 16; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const 
uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + left_col = _mm_unpackhi_epi64(left_col, left_col); + row0 = _mm_shufflelo_epi16(left_col, 0); + row1 = _mm_shufflelo_epi16(left_col, 0x55); + row2 = _mm_shufflelo_epi16(left_col, 0xaa); + row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, 
left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + dst += stride; + } +} + +static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflelo_epi16(*x, 0); + const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); + + row[0] = _mm_unpacklo_epi64(u0, u0); + row[1] = _mm_unpacklo_epi64(u1, u1); + row[2] = _mm_unpacklo_epi64(u2, u2); + row[3] = _mm_unpacklo_epi64(u3, u3); +} + +static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflehi_epi16(*x, 0); + const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); + + row[0] = _mm_unpackhi_epi64(u0, u0); + row[1] = _mm_unpackhi_epi64(u1, u1); + row[2] = _mm_unpackhi_epi64(u2, u2); + row[3] = _mm_unpackhi_epi64(u3, u3); +} + +// Process 16x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +// Process 16x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + int i = 0; + + do { + left_col = _mm_load_si128((const __m128i *)left); + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left += 16; + i++; + } while (i < 2); +} + +static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + _mm_store_si128((__m128i *)(dst + 16), row[i]); + dst += stride; + } +} + +// Process 32x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void 
h_prediction_32x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +// Process 32x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c new file mode 100644 index 000000000..85b82744e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -0,0 +1,885 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// TM_PRED + +// Return 8 16-bit pixels in one row +static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, + const __m128i *topleft) { + const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); + + __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); + __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); + __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); + + __m128i mask1 = _mm_cmpgt_epi16(pl, pt); + mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); + __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); + + pl = _mm_andnot_si128(mask1, *left); + + ptl = _mm_and_si128(mask2, *topleft); + pt = _mm_andnot_si128(mask2, *top); + pt = _mm_or_si128(pt, ptl); + pt = _mm_and_si128(mask1, pt); + + return _mm_or_si128(pl, pt); +} + +void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = 
_mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// Return 16 8-bit pixels in one row +static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, + const __m128i *top1, + const __m128i *topleft) { + const __m128i p0 = paeth_8x1_pred(left, top0, topleft); + const __m128i p1 = paeth_8x1_pred(left, top1, topleft); + return _mm_packus_epi16(p0, p1); +} + +void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + for (i = 0; i 
< 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + l = _mm_load_si128((const __m128i *)(left + 16)); + rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + rep = _mm_set1_epi16(0x8000); + l = _mm_load_si128((const __m128i *)(left + 16)); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_loadl_epi64((const __m128i *)above); + pixels[2] = _mm_set1_epi16((uint16_t)above[3]); + pixels[1] = _mm_loadl_epi64((const __m128i *)left); + + const __m128i bp = 
_mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weights[0]: weights_h vector +// weights[1]: scale - weights_h vecotr +// weights[2]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); + const __m128i zero = _mm_setzero_si128(); + + weights[0] = _mm_unpacklo_epi8(t, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weights[1] = _mm_sub_epi16(d, weights[0]); + weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]); + + if (height == 8) { + t = _mm_srli_si128(t, 4); + weights[0] = _mm_unpacklo_epi8(t, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } +} + +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight, + int h, uint8_t *dst, ptrdiff_t stride) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s = _mm_madd_epi16(pixel[0], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixel[1], rep); + b = _mm_unpacklo_epi16(b, pixel[2]); + __m128i sum = _mm_madd_epi16(b, weight[2]); + + sum = _mm_add_epi32(s, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 4, pixels); + + __m128i weights[3]; + load_weight_w4(sm_weight_arrays, 4, weights); + + smooth_pred_4xh(pixels, weights, 4, dst, stride); +} + +void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 8, pixels); + + __m128i weights[3]; + load_weight_w4(sm_weight_arrays, 8, weights); + + smooth_pred_4xh(pixels, weights, 8, dst, stride); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +// pixels[2]: left vector +// pixels[3]: right_pred vector +static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_loadl_epi64((const __m128i *)above); + pixels[3] = _mm_set1_epi16((uint16_t)above[7]); + pixels[2] = _mm_load_si128((const __m128i *)left); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave 
vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_w8(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const int we_offset = height < 8 ? 4 : 8; + __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + + if (height == 4) { + we = _mm_srli_si128(we, 4); + __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + we = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(we, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } +} + +static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixels[2], rep); + b = _mm_unpacklo_epi16(b, pixels[3]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 4, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 8, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 16, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 
0); + dst += stride << 3; + smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +// pixels[0]: above and below_pred interleave vector, 1/4 +// pixels[1]: above and below_pred interleave vector, 2/4 +// pixels[2]: above and below_pred interleave vector, 3/4 +// pixels[3]: above and below_pred interleave vector, 3/4 +// pixels[4]: left vector +// pixels[5]: left vector, h = 32 only +// pixels[6]: right_pred vector +static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i ab = _mm_load_si128((const __m128i *)above); + pixels[6] = _mm_set1_epi16((uint16_t)above[15]); + pixels[4] = _mm_load_si128((const __m128i *)left); + pixels[5] = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(ab, zero); + pixels[0] = _mm_unpacklo_epi16(x, bp); + pixels[1] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab, zero); + pixels[2] = _mm_unpacklo_epi16(x, bp); + pixels[3] = _mm_unpackhi_epi16(x, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// ... ... +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +// ... ... +static INLINE void load_weight_w16(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]); + __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); + __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); + __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 8) { + weight_h[0] = _mm_unpacklo_epi8(w8, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); // scale - weight_h + + __m128i x = _mm_unpacklo_epi8(w16, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + x = _mm_unpackhi_epi8(w16, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + } + + if (height == 16) { + weight_h[0] = _mm_unpacklo_epi8(w16, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w16, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); + weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); + } + + if (height == 32) { + weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + __m128i x = _mm_unpacklo_epi8(w16, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + x = _mm_unpackhi_epi8(w16, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + + weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); + weight_h[5] = _mm_sub_epi16(d, 
weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, uint8_t *dst, + ptrdiff_t stride, int quarter) { + __m128i d = _mm_set1_epi16(0x100); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + __m128i rep = + (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); + const __m128i left = (quarter < 2) ? pixels[4] : pixels[5]; + + int i; + for (i = 0; i < 8; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc); + __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc); + + __m128i b = _mm_shuffle_epi8(left, rep); + b = _mm_unpacklo_epi16(b, pixels[6]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + __m128i sum2 = _mm_madd_epi16(b, ww[2]); + __m128i sum3 = _mm_madd_epi16(b, ww[3]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + s2 = _mm_add_epi32(s2, sum2); + s2 = _mm_add_epi32(s2, round); + s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale); + + s3 = _mm_add_epi32(s3, sum3); + s3 = _mm_add_epi32(s3, round); + s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + sum1 = _mm_packus_epi16(s2, s3); + sum1 = _mm_shuffle_epi8(sum1, gat); + + _mm_storel_epi64((__m128i *)dst, sum0); + _mm_storel_epi64((__m128i *)(dst + 8), sum1); + + dst += stride; + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 8, pixels); + + __m128i wh[2], ww[4]; + load_weight_w16(sm_weight_arrays, 8, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 16, pixels); + + __m128i wh[4], ww[4]; + load_weight_w16(sm_weight_arrays, 16, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 32, pixels); + + __m128i wh[8], ww[4]; + load_weight_w16(sm_weight_arrays, 32, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3); +} + +static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i ab0 = _mm_load_si128((const 
__m128i *)above); + __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16)); + + pixels[10] = _mm_set1_epi16((uint16_t)above[31]); + pixels[8] = _mm_load_si128((const __m128i *)left); + pixels[9] = _mm_load_si128((const __m128i *)(left + 16)); + + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(ab0, zero); + pixels[0] = _mm_unpacklo_epi16(x, bp); + pixels[1] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab0, zero); + pixels[2] = _mm_unpacklo_epi16(x, bp); + pixels[3] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpacklo_epi8(ab1, zero); + pixels[4] = _mm_unpacklo_epi16(x, bp); + pixels[5] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab1, zero); + pixels[6] = _mm_unpacklo_epi16(x, bp); + pixels[7] = _mm_unpackhi_epi16(x, bp); +} + +static INLINE void load_weight_w32(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); + __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); + __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 16) { + weight_h[0] = _mm_unpacklo_epi8(w16, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w16, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + __m128i x = _mm_unpacklo_epi8(w32_0, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpackhi_epi8(w32_0, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpacklo_epi8(w32_1, zero); + y = _mm_sub_epi16(d, x); + weight_w[4] = _mm_unpacklo_epi16(x, y); + weight_w[5] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpackhi_epi8(w32_1, zero); + y = _mm_sub_epi16(d, x); + weight_w[6] = _mm_unpacklo_epi16(x, y); + weight_w[7] = _mm_unpackhi_epi16(x, y); + } + + if (height == 32) { + weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); + weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); + + weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]); + weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]); + weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]); + weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]); + } +} + +static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, uint8_t *dst, + ptrdiff_t stride, int quarter) { + __m128i d = _mm_set1_epi16(0x100); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + __m128i rep = + (quarter % 2 == 0) ? 
_mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); + const __m128i left = (quarter < 2) ? pixels[8] : pixels[9]; + + int i; + for (i = 0; i < 8; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + + int j; + __m128i s[8]; + __m128i b = _mm_shuffle_epi8(left, rep); + b = _mm_unpacklo_epi16(b, pixels[10]); + + for (j = 0; j < 8; ++j) { + s[j] = _mm_madd_epi16(pixels[j], wh_sc); + s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j])); + s[j] = _mm_add_epi32(s[j], round); + s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale); + } + + for (j = 0; j < 8; j += 2) { + __m128i sum = _mm_packus_epi16(s[j], s[j + 1]); + sum = _mm_shuffle_epi8(sum, gat); + _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum); + } + dst += stride; + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[11]; + load_pixel_w32(above, left, 16, pixels); + + __m128i wh[4], ww[8]; + load_weight_w32(sm_weight_arrays, 16, wh, ww); + + smooth_pred_32x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[11]; + load_pixel_w32(above, left, 32, pixels); + + __m128i wh[8], ww[8]; + load_weight_w32(sm_weight_arrays, 32, wh, ww); + + smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3); +} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h index 4238e651b..26c5cfe59 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h @@ -18,17 +18,17 @@ #include "aom_dsp/x86/txfm_common_avx2.h" static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { -#if CONFIG_HIGHBITDEPTH - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); -#else - *in = _mm256_loadu_si256((const __m256i *)coeff); -#endif + if (sizeof(tran_low_t) == 4) { + *in = _mm256_setr_epi16( + (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], + (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], + (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], + (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], + (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], + (int16_t)coeff[15]); + } else { + *in = _mm256_loadu_si256((const __m256i *)coeff); + } } static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h index 95d246c3c..342816977 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h @@ -133,12 +133,12 @@ static INLINE void array_transpose_16x16(__m128i *res0, 
__m128i *res1) { // Function to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled static INLINE __m128i load_input_data(const tran_low_t *data) { -#if CONFIG_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); -#else - return _mm_load_si128((const __m128i *)data); -#endif + if (sizeof(tran_low_t) == 4) { + return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], + data[6], data[7]); + } else { + return _mm_load_si128((const __m128i *)data); + } } static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index 7e134dc63..8343dbbed 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -178,10 +178,20 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, #endif // !CONFIG_PARALLEL_DEBLOCKING FILTER4; +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 8); + *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0); + + *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 8); + *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0); +#else _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 +#endif } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, @@ -267,8 +277,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); +#if !CONFIG_PARALLEL_DEBLOCKING // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); +#endif // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); @@ -279,7 +291,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); ps1ps0 = _mm_srli_si128(ps1ps0, 4); *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - +#if !CONFIG_PARALLEL_DEBLOCKING *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); qs1qs0 = _mm_srli_si128(qs1qs0, 4); *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); @@ -287,6 +299,19 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); qs1qs0 = _mm_srli_si128(qs1qs0, 4); *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); +#endif +} + +static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num, + uint8_t *s) { +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x); + const __m128i hi = _mm_srli_si128(*x, 8); + *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi); +#else + _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x); + _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x)); +#endif } void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, @@ -580,44 +605,37 @@ void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); 
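// Note on the packing used below: each qNpN register holds pN in its low
// 8 bytes and qN in its high 8 bytes, so store_buffer_horz_8(&x, p, num, s)
// writes the low half to the row at s - (num + 1) * p and the high half to
// the row at s + num * p.  Under CONFIG_PARALLEL_DEBLOCKING only the first
// 4 pixels of each of those rows are stored; otherwise all 8 are.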
+ store_buffer_horz_8(&q6p6, p, 6, s); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + store_buffer_horz_8(&q5p5, p, 5, s); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + store_buffer_horz_8(&q4p4, p, 4, s); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + store_buffer_horz_8(&q3p3, p, 3, s); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + store_buffer_horz_8(&q2p2, p, 2, s); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + store_buffer_horz_8(&q1p1, p, 1, s); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + store_buffer_horz_8(&q0p0, p, 0, s); } } @@ -651,10 +669,33 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput; + +static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x, + int p, int offset, uint8_t *s) { + int i; + if (pixel_num == FOUR_PIXELS) { + for (i = 13; i >= 0; i--) { + *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]); + } + } + if (pixel_num == EIGHT_PIXELS) { + for (i = 13; i >= 0; i--) { + _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]); + } + } + if (pixel_num == SIXTEEN_PIXELS) { + for (i = 13; i >= 0; i--) { + _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]); + } + } +} + +static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num, + unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -910,73 +951,62 @@ void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + __m128i x[14]; + x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + 
x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + + store_buffer_horz_16(pixel_num, x, p, 6, s); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1186,15 +1216,35 @@ void 
aom_lpf_horizontal_8_sse2(unsigned char *s, int p, p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2); + *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1); + *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0); + *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0); + *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1); + *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2); +#else _mm_storel_epi64((__m128i *)(s - 3 * p), p2); _mm_storel_epi64((__m128i *)(s - 2 * p), p1); _mm_storel_epi64((__m128i *)(s - 1 * p), p0); _mm_storel_epi64((__m128i *)(s + 0 * p), q0); _mm_storel_epi64((__m128i *)(s + 1 * p), q1); _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +#endif } } +void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { +#if CONFIG_PARALLEL_DEBLOCKING + lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh); +#else + lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh); +#endif +} + void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h new file mode 100644 index 000000000..027c890dc --- /dev/null +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H +#define _AOM_DSP_X86_LPF_COMMON_X86_H + +#include <emmintrin.h> // SSE2 + +#include "./aom_config.h" + +static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; + do { + uint16_t *in = src[idx8x8]; + uint16_t *out = dst[idx8x8]; + + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 + x0 = _mm_unpacklo_epi16(p0, p1); + // 20 30 21 31 22 32 23 33 + x1 = _mm_unpacklo_epi16(p2, p3); + // 40 50 41 51 42 52 43 53 + x2 = _mm_unpacklo_epi16(p4, p5); + // 60 70 61 71 62 72 63 73 + x3 = _mm_unpacklo_epi16(p6, p7); + // 00 10 20 30 01 11 21 31 + x4 = _mm_unpacklo_epi32(x0, x1); + // 40 50 60 70 41 51 61 71 + x5 = _mm_unpacklo_epi32(x2, x3); + // 00 10 20 30 40 50 60 70 + x6 = _mm_unpacklo_epi64(x4, x5); + // 01 11 21 31 41 51 61 71 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); + // 00 10 20 30 40 50 60 70 + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); + // 01 11 21 31 41 51 61 71 + + // 02 12 22 32 03 13 23 33 + x4 = _mm_unpackhi_epi32(x0, x1); + // 42 52 62 72 43 53 63 73 + x5 = _mm_unpackhi_epi32(x2, x3); + // 02 12 22 32 42 52 62 72 + x6 = _mm_unpacklo_epi64(x4, x5); + // 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); + // 02 12 22 32 42 52 62 72 + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); + // 03 13 23 33 43 53 63 73 + + // 04 14 05 15 06 16 07 17 + x0 = _mm_unpackhi_epi16(p0, p1); + // 24 34 25 35 26 36 27 37 + x1 = _mm_unpackhi_epi16(p2, p3); + // 44 54 45 55 46 56 47 57 + x2 = _mm_unpackhi_epi16(p4, p5); + // 64 74 65 75 66 76 67 77 + x3 = _mm_unpackhi_epi16(p6, p7); + // 04 14 24 34 05 15 25 35 + x4 = _mm_unpacklo_epi32(x0, x1); + // 44 54 64 74 45 55 65 75 + x5 = _mm_unpacklo_epi32(x2, x3); + // 04 14 24 34 44 54 64 74 + x6 = _mm_unpacklo_epi64(x4, x5); + // 05 15 25 35 45 55 65 75 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); + // 04 14 24 34 44 54 64 74 + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); + // 05 15 25 35 45 55 65 75 + + // 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi32(x0, x1); + // 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi32(x2, x3); + // 06 16 26 36 46 56 66 76 + x6 = _mm_unpacklo_epi64(x4, x5); + // 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); + // 06 16 26 36 46 56 66 76 + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); + // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { + uint16_t *src0[1]; + uint16_t *src1[1]; + uint16_t *dest0[1]; + uint16_t *dest1[1]; + src0[0] = 
in0; + src1[0] = in1; + dest0[0] = out; + dest1[0] = out + 8; + highbd_transpose(src0, in_p, dest0, out_p, 1); + highbd_transpose(src1, in_p, dest1, out_p, 1); +} +#endif // _AOM_DSP_X86_LPF_COMMON_X86_H diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 6a73ac460..2536f91d2 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -98,7 +98,13 @@ MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) -#endif +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +MASKSADMXN_SSSE3(32, 128) +MASKSADMXN_SSSE3(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -294,7 +300,13 @@ HIGHBD_MASKSAD4XN_SSSE3(16) HIGHBD_MASKSADMXN_SSSE3(16, 4) HIGHBD_MASKSADMXN_SSSE3(8, 32) HIGHBD_MASKSADMXN_SSSE3(32, 8) -#endif +HIGHBD_MASKSADMXN_SSSE3(16, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN_SSSE3(32, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c index 24e7ed1c6..3ffe132be 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -131,7 +131,13 @@ MASK_SUBPIX_VAR4XH_SSSE3(16) MASK_SUBPIX_VAR_SSSE3(16, 4) MASK_SUBPIX_VAR8XH_SSSE3(32) MASK_SUBPIX_VAR_SSSE3(32, 8) -#endif +MASK_SUBPIX_VAR_SSSE3(64, 16) +MASK_SUBPIX_VAR_SSSE3(16, 64) +#if CONFIG_EXT_PARTITION +MASK_SUBPIX_VAR_SSSE3(128, 32) +MASK_SUBPIX_VAR_SSSE3(32, 128) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -712,6 +718,12 @@ HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32) +#endif #endif static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c index 3fd6f71e5..52dd508ec 100644 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -142,6 +142,8 @@ OBMCSADWXH(4, 16) OBMCSADWXH(16, 4) OBMCSADWXH(8, 32) OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) #endif //////////////////////////////////////////////////////////////////////////////// @@ -271,5 +273,7 @@ HBD_OBMCSADWXH(4, 16) HBD_OBMCSADWXH(16, 4) HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) #endif #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 44cfa8e28..392616af3 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -151,7 +151,13 @@ OBMCVARWXH(4, 16) OBMCVARWXH(16, 4) 
OBMCVARWXH(8, 32) OBMCVARWXH(32, 8) -#endif +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) +#if CONFIG_EXT_PARTITION +OBMCVARWXH(32, 128) +OBMCVARWXH(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES //////////////////////////////////////////////////////////////////////////////// // High bit-depth @@ -364,5 +370,11 @@ HBD_OBMCVARWXH(4, 16) HBD_OBMCVARWXH(16, 4) HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) -#endif +HBD_OBMCVARWXH(16, 64) +HBD_OBMCVARWXH(64, 16) +#if CONFIG_EXT_PARTITION +HBD_OBMCVARWXH(32, 128) +HBD_OBMCVARWXH(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 890c1f01e..0e7f679d0 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -16,29 +16,29 @@ #include "aom/aom_integer.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif + if (sizeof(tran_low_t) == 4) { + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); + } else { + return _mm_load_si128((const __m128i *)coeff_ptr); + } } static INLINE void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif + if (sizeof(tran_low_t) == 4) { + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); + } else { + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); + } } void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm index 4570e2ce6..2c67f450f 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -256,4 +256,6 @@ SADNXN4D 4, 16 SADNXN4D 16, 4 SADNXN4D 8, 32 SADNXN4D 32, 8 +SADNXN4D 16, 64 +SADNXN4D 64, 16 %endif diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm index 88d427077..b4cc6abf1 100644 --- a/third_party/aom/aom_dsp/x86/sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -163,6 +163,10 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; 
sad64x32_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +SAD64XN 16 ; sad64x16_sse2 +SAD64XN 16, 1 ; sad64x16_avg_sse2 +%endif ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -261,6 +265,8 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 %if CONFIG_EXT_PARTITION_TYPES SAD16XN 4 ; sad_16x4_sse2 SAD16XN 4, 1 ; sad_16x4_avg_sse2 +SAD16XN 64 ; sad_16x64_sse2 +SAD16XN 64, 1 ; sad_16x64_avg_sse2 %endif ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h index 4f7a60c22..1a8fed710 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -15,6 +15,7 @@ #include <immintrin.h> #include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/common_avx2.h" #define pair256_set_epi16(a, b) \ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ @@ -34,135 +35,6 @@ static INLINE void mm256_reverse_epi16(__m256i *u) { *u = _mm256_permute2x128_si256(v, v, 1); } -// Note: in and out could have the same value -static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { - __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); - __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); - __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); - __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); - __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); - - __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); - __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); - __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); - __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); - __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); - __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); - __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); - __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); - - // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b - // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f - // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b - // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f - // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b - // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f - // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b - // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f - - // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b - // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f - // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb - // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf - // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db - // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df - // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb - // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff - - __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); - __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); - __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); - __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); - __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); - __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); - __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); - __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); - - __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); - __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); - __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, 
tr0_b); - __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); - __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); - __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); - __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); - __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); - - // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 - // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b - // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d - // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f - // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 - // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b - // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d - // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f - - // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 - // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb - // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd - // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf - // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 - // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb - // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd - // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff - - tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - - tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); - tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); - tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); - tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); - tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); - tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); - tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); - tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); - - // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 - // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 - // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a - // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b - // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c - // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d - // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e - // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f - - // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 - // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 - // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa - // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb - // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc - // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd - // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe - // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff - - out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 - out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 - out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); - out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); - out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); - out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); - out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); - out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); - - out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); - out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); - out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); - out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); - out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); - out[14] = 
_mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); - out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); - out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); -} - static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1, const __m256i *cospi) { const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h index e4ac56339..4e6eecd32 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h @@ -16,16 +16,16 @@ // This header file should be put below any x86 intrinsics head file static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_storeu_si128((__m128i *)(dst_ptr), out0); - _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); -#else - _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); -#endif // CONFIG_HIGHBITDEPTH + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); + } } #endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 918844185..211fad3f8 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -382,6 +382,28 @@ unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride, assert(sum >= -255 * 32 * 8); return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); } + +unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 16 * 64); + assert(sum >= -255 * 16 * 64); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 16); + assert(sum >= -255 * 64 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} #endif // The 2 unused parameters are place holders for PIC enabled build. 
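// Reference note for the new 16x64 / 64x16 wrappers above: variance is
// computed as SSE - sum^2 / (w * h), and since 16 * 64 = 1 << 10 that
// division appears as the ">> 10" shift.  A minimal scalar sketch of the
// same arithmetic (illustrative only; the helper name is hypothetical):
static unsigned int variance_16x64_scalar_ref(const uint8_t *src,
                                              int src_stride,
                                              const uint8_t *ref,
                                              int ref_stride,
                                              unsigned int *sse) {
  int r, c;
  int64_t sum = 0, sse64 = 0;
  for (r = 0; r < 64; ++r) {
    for (c = 0; c < 16; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sum += d;      // accumulate signed differences
      sse64 += d * d;  // accumulate squared differences
    }
  }
  *sse = (unsigned int)sse64;
  // Subtract the squared mean: sum^2 / (16 * 64) == sum^2 >> 10.
  return *sse - (unsigned int)((sum * sum) >> 10);
}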
@@ -451,7 +473,9 @@ DECLS(ssse3); FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ - FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)) + FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \ + FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \ + FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t)) #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ @@ -543,7 +567,9 @@ DECLS(ssse3); FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ - FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)) + FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \ + FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \ + FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t)) #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ |