diff options
Diffstat (limited to 'third_party/aom/aom_dsp')
216 files changed, 0 insertions, 85133 deletions
diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c deleted file mode 100644 index bfb3e7e00..000000000 --- a/third_party/aom/aom_dsp/add_noise.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <math.h> -#include <stdlib.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], - char whiteclamp[16], char bothclamp[16], - unsigned int width, unsigned int height, int pitch) { - unsigned int i, j; - - for (i = 0; i < height; ++i) { - uint8_t *pos = start + i * pitch; - char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT - - for (j = 0; j < width; ++j) { - int v = pos[j]; - - v = clamp(v - blackclamp[0], 0, 255); - v = clamp(v + bothclamp[0], 0, 255); - v = clamp(v - whiteclamp[0], 0, 255); - - pos[j] = v + ref[j]; - } - } -} - -static double gaussian(double sigma, double mu, double x) { - return 1 / (sigma * sqrt(2.0 * 3.14159265)) * - (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); -} - -int aom_setup_noise(double sigma, int size, char *noise) { - char char_dist[256]; - int next = 0, i, j; - - // set up a 256 entry lookup that matches gaussian distribution - for (i = -32; i < 32; ++i) { - const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); - if (a_i) { - for (j = 0; j < a_i; ++j) { - char_dist[next + j] = (char)i; - } - next = next + j; - } - } 
- - // Rounding error - might mean we have less than 256. - for (; next < 256; ++next) { - char_dist[next] = 0; - } - - for (i = 0; i < size; ++i) { - noise[i] = char_dist[rand() & 0xff]; // NOLINT - } - - // Returns the highest non 0 value used in distribution. - return -char_dist[0]; -} diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c deleted file mode 100644 index 4791826da..000000000 --- a/third_party/aom/aom_dsp/aom_convolve.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <string.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { - int sum = 0; - for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; - return sum; -} - -static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride, - const int16_t *b) { - int sum = 0; - for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; - return sum; -} - -static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - src -= SUBPEL_TAPS / 2 - 1; - for (int y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (int x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - const int sum = horz_scalar_product(src_x, x_filter); - dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (int x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (int y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - const int sum = vert_scalar_product(src_y, src_stride, y_filter); - dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter 
table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); -} - -void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); -} - -void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { - int r; - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - for (r = h; r > 0; --r) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } -} - -static INLINE int highbd_vert_scalar_product(const uint16_t *a, - ptrdiff_t a_stride, - const int16_t *b) { - int sum = 0; - for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; - return sum; -} - -static INLINE int highbd_horz_scalar_product(const uint16_t *a, - const int16_t *b) { - int sum = 0; - for (int k 
= 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; - return sum; -} - -static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= SUBPEL_TAPS / 2 - 1; - for (int y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (int x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - const int sum = highbd_horz_scalar_product(src_x, x_filter); - dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (int x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (int y = 0; y < h; ++y) { - const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter); - dst[y * dst_stride] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; - (void)y_step_q4; - - highbd_convolve_horiz(src, 
src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; - (void)x_step_q4; - - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h, bd); -} - -void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { - int r; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - (void)bd; - - for (r = h; r > 0; --r) { - memcpy(dst, src, w * sizeof(uint16_t)); - src += src_stride; - dst += dst_stride; - } -} diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake deleted file mode 100644 index 11ff73756..000000000 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ /dev/null @@ -1,356 +0,0 @@ -# -# Copyright (c) 2017, Alliance for Open Media. All rights reserved -# -# This source code is subject to the terms of the BSD 2 Clause License and the -# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was -# not distributed with this source code in the LICENSE file, you can obtain it -# at www.aomedia.org/license/software. If the Alliance for Open Media Patent -# License 1.0 was not distributed with this source code in the PATENTS file, you -# can obtain it at www.aomedia.org/license/patent. 
-# -if(AOM_AOM_DSP_AOM_DSP_CMAKE_) - return() -endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_ -set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) - -list(APPEND AOM_DSP_COMMON_SOURCES - "${AOM_ROOT}/aom_dsp/aom_convolve.c" - "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" - "${AOM_ROOT}/aom_dsp/aom_filter.h" - "${AOM_ROOT}/aom_dsp/aom_simd.h" - "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" - "${AOM_ROOT}/aom_dsp/blend.h" - "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" - "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" - "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" - "${AOM_ROOT}/aom_dsp/entcode.c" - "${AOM_ROOT}/aom_dsp/entcode.h" - "${AOM_ROOT}/aom_dsp/fft.c" - "${AOM_ROOT}/aom_dsp/fft_common.h" - "${AOM_ROOT}/aom_dsp/intrapred.c" - "${AOM_ROOT}/aom_dsp/intrapred_common.h" - "${AOM_ROOT}/aom_dsp/loopfilter.c" - "${AOM_ROOT}/aom_dsp/prob.h" - "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" - "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" - "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" - "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" - "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" - "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" - "${AOM_ROOT}/aom_dsp/subtract.c" - "${AOM_ROOT}/aom_dsp/txfm_common.h" - "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h") - -list(APPEND AOM_DSP_COMMON_ASM_SSE2 - "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") - -list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 - "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" - "${AOM_ROOT}/aom_dsp/x86/convolve.h" - 
"${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h") - -list(APPEND AOM_DSP_COMMON_ASM_SSSE3 - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm") - -list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") - -list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 - "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h" - "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") - -list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c") - -list(APPEND AOM_DSP_COMMON_INTRIN_NEON - "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" - "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" - "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c") - -list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2 - "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h" - 
"${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") - -list(APPEND AOM_DSP_COMMON_INTRIN_MSA - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h" - "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" - "${AOM_ROOT}/aom_dsp/mips/macros_msa.h") - -if(CONFIG_AV1_DECODER) - list(APPEND AOM_DSP_DECODER_SOURCES - "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" - "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" - "${AOM_ROOT}/aom_dsp/bitreader.h" - "${AOM_ROOT}/aom_dsp/daalaboolreader.c" - "${AOM_ROOT}/aom_dsp/daalaboolreader.h" - "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h" - "${AOM_ROOT}/aom_dsp/grain_synthesis.c" - "${AOM_ROOT}/aom_dsp/grain_synthesis.h") -endif() - -if(CONFIG_AV1_ENCODER) - list(APPEND AOM_DSP_ENCODER_SOURCES - "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" - "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" - "${AOM_ROOT}/aom_dsp/bitwriter.h" - "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" - "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" - "${AOM_ROOT}/aom_dsp/entenc.c" - "${AOM_ROOT}/aom_dsp/entenc.h" - "${AOM_ROOT}/aom_dsp/fwd_txfm.c" - "${AOM_ROOT}/aom_dsp/grain_table.c" - "${AOM_ROOT}/aom_dsp/grain_table.h" - "${AOM_ROOT}/aom_dsp/noise_model.c" - "${AOM_ROOT}/aom_dsp/noise_model.h" - "${AOM_ROOT}/aom_dsp/noise_util.c" - "${AOM_ROOT}/aom_dsp/noise_util.h" - "${AOM_ROOT}/aom_dsp/psnr.c" - "${AOM_ROOT}/aom_dsp/psnr.h" - 
"${AOM_ROOT}/aom_dsp/quantize.c" - "${AOM_ROOT}/aom_dsp/quantize.h" - "${AOM_ROOT}/aom_dsp/sad.c" - "${AOM_ROOT}/aom_dsp/sse.c" - "${AOM_ROOT}/aom_dsp/sad_av1.c" - "${AOM_ROOT}/aom_dsp/sum_squares.c" - "${AOM_ROOT}/aom_dsp/variance.c" - "${AOM_ROOT}/aom_dsp/variance.h") - - list(APPEND AOM_DSP_ENCODER_ASM_SSE2 - "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") - - list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" - "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") - - list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" - "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm") - - list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 - "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" - 
"${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c") - - list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 - "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") - - list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64 - "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") - - list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 - "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" - "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" - "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c") - - list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") - - list(APPEND AOM_DSP_ENCODER_INTRIN_NEON - "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" - "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" - "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" - "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") - - list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" - "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" - "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" - "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") - - if(CONFIG_INTERNAL_STATS) - list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c" - "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c" - "${AOM_ROOT}/aom_dsp/ssim.h") - endif() -endif() - -# Creates aom_dsp build targets. Must not be called until after libaom target -# has been created. 
-function(setup_aom_dsp_targets) - add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) - list(APPEND AOM_LIB_TARGETS aom_dsp_common) - create_dummy_source_file("aom_av1" "c" "dummy_source_file") - add_library(aom_dsp OBJECT "${dummy_source_file}") - target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_common>) - list(APPEND AOM_LIB_TARGETS aom_dsp) - - # Not all generators support libraries consisting only of object files. Add a - # dummy source file to the aom_dsp target. - add_dummy_source_file_to_target("aom_dsp" "c") - - if(CONFIG_AV1_DECODER) - add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) - list(APPEND AOM_LIB_TARGETS aom_dsp_decoder) - target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>) - endif() - - if(CONFIG_AV1_ENCODER) - add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) - list(APPEND AOM_LIB_TARGETS aom_dsp_encoder) - target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>) - endif() - - if(HAVE_SSE2) - add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom") - add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_SSE2" "aom") - - if(CONFIG_AV1_ENCODER) - add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom") - add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSE2" "aom") - endif() - endif() - - if(HAVE_SSSE3) - add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom") - add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_SSSE3" "aom") - - if(CONFIG_AV1_ENCODER) - if("${AOM_TARGET_CPU}" STREQUAL "x86_64") - list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 - ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) - endif() - add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom") - add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom") - endif() - endif() - - if(HAVE_SSE4_1) - 
add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom") - if(CONFIG_AV1_ENCODER) - add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom") - endif() - endif() - - if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") - if(CONFIG_AV1_ENCODER) - add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64" - "aom") - endif() - endif() - - if(HAVE_AVX2) - add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_AVX2" "aom") - if(CONFIG_AV1_ENCODER) - add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_AVX2" "aom") - endif() - endif() - - if(HAVE_NEON) - add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" - "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON" - "aom") - if(CONFIG_AV1_ENCODER) - add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" - "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_NEON" "aom") - endif() - endif() - - if(HAVE_DSPR2) - add_intrinsics_object_library("" "dspr2" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_DSPR2" "aom") - endif() - - if(HAVE_MSA) - add_intrinsics_object_library("" "msa" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_MSA" "aom") - if(CONFIG_AV1_ENCODER) - add_intrinsics_object_library("" "msa" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_MSA" "aom") - endif() - endif() - - # Pass the new lib targets up to the parent scope instance of - # $AOM_LIB_TARGETS. - set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) -endfunction() diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h deleted file mode 100644 index a185b23c8..000000000 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ -#define AOM_AOM_DSP_AOM_DSP_COMMON_H_ - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef MAX_SB_SIZE -#define MAX_SB_SIZE 128 -#endif // ndef MAX_SB_SIZE - -#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) -#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) - -#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') - -#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) - -/* Left shifting a negative value became undefined behavior in C99 (downgraded - from merely implementation-defined in C89). This should still compile to the - correct thing on any two's-complement machine, but avoid ubsan warnings.*/ -#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift))) - -// These can be used to give a hint about branch outcomes. -// This can have an effect, even if your target processor has a -// good branch predictor, as these hints can affect basic block -// ordering by the compiler. -#ifdef __GNUC__ -#define LIKELY(v) __builtin_expect(v, 1) -#define UNLIKELY(v) __builtin_expect(v, 0) -#else -#define LIKELY(v) (v) -#define UNLIKELY(v) (v) -#endif - -typedef uint8_t qm_val_t; -#define AOM_QM_BITS 5 - -// Note: -// tran_low_t is the datatype used for final transform coefficients. -// tran_high_t is the datatype used for intermediate transform stages. 
-typedef int64_t tran_high_t; -typedef int32_t tran_low_t; - -static INLINE uint8_t clip_pixel(int val) { - return (val > 255) ? 255 : (val < 0) ? 0 : val; -} - -static INLINE int clamp(int value, int low, int high) { - return value < low ? low : (value > high ? high : value); -} - -static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { - return value < low ? low : (value > high ? high : value); -} - -static INLINE double fclamp(double value, double low, double high) { - return value < low ? low : (value > high ? high : value); -} - -static INLINE uint16_t clip_pixel_highbd(int val, int bd) { - switch (bd) { - case 8: - default: return (uint16_t)clamp(val, 0, 255); - case 10: return (uint16_t)clamp(val, 0, 1023); - case 12: return (uint16_t)clamp(val, 0, 4095); - } -} - -// The result of this branchless code is equivalent to (value < 0 ? 0 : value) -// or max(0, value) and might be faster in some cases. -// Care should be taken since the behavior of right shifting signed type -// negative value is undefined by C standards and implementation defined, -static INLINE unsigned int negative_to_zero(int value) { - return value & ~(value >> (sizeof(value) * 8 - 1)); -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c deleted file mode 100644 index 1514bd64e..000000000 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include "config/aom_config.h" - -#define RTCD_C -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/aom_once.h" - -void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl deleted file mode 100755 index 8e8a480fe..000000000 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ /dev/null @@ -1,1575 +0,0 @@ -## -## Copyright (c) 2017, Alliance for Open Media. All rights reserved -## -## This source code is subject to the terms of the BSD 2 Clause License and -## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -## was not distributed with this source code in the LICENSE file, you can -## obtain it at www.aomedia.org/license/software. If the Alliance for Open -## Media Patent License 1.0 was not distributed with this source code in the -## PATENTS file, you can obtain it at www.aomedia.org/license/patent. -## -sub aom_dsp_forward_decls() { -print <<EOF -/* - * DSP - */ - -#include "aom/aom_integer.h" -#include "aom_dsp/aom_dsp_common.h" -#include "av1/common/enums.h" -#include "av1/common/blockd.h" - -EOF -} -forward_decls qw/aom_dsp_forward_decls/; - -# optimizations which depend on multiple features -$avx2_ssse3 = ''; -if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) { - $avx2_ssse3 = 'avx2'; -} - -# functions that are 64 bit only. 
-$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = ''; -if ($opts{arch} eq "x86_64") { - $mmx_x86_64 = 'mmx'; - $sse2_x86_64 = 'sse2'; - $ssse3_x86_64 = 'ssse3'; - $avx_x86_64 = 'avx'; - $avx2_x86_64 = 'avx2'; -} - -@block_widths = (4, 8, 16, 32, 64, 128); - -@block_sizes = (); -foreach $w (@block_widths) { - foreach $h (@block_widths) { - push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ; - } -} -push @block_sizes, [4, 16]; -push @block_sizes, [16, 4]; -push @block_sizes, [8, 32]; -push @block_sizes, [32, 8]; -push @block_sizes, [16, 64]; -push @block_sizes, [64, 16]; - -@tx_dims = (2, 4, 8, 16, 32, 64); -@tx_sizes = (); -foreach $w (@tx_dims) { - push @tx_sizes, [$w, $w]; - foreach $h (@tx_dims) { - push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w)); - push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w)); - } -} - -@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/; - -# -# Intra prediction -# - -foreach (@tx_sizes) { - ($w, $h) = @$_; - foreach $pred_name (@pred_names) { - add_proto "void", "aom_${pred_name}_predictor_${w}x${h}", - "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; - add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}", - "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - } -} - -specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/; -specialize qw/aom_dc_top_predictor_4x8 sse2/; -specialize qw/aom_dc_top_predictor_4x16 sse2/; -specialize qw/aom_dc_top_predictor_8x4 sse2/; -specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/; -specialize qw/aom_dc_top_predictor_8x16 sse2/; -specialize qw/aom_dc_top_predictor_8x32 sse2/; -specialize qw/aom_dc_top_predictor_16x4 sse2/; -specialize qw/aom_dc_top_predictor_16x8 sse2/; -specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/; -specialize qw/aom_dc_top_predictor_16x32 sse2/; -specialize 
qw/aom_dc_top_predictor_16x64 sse2/; -specialize qw/aom_dc_top_predictor_32x8 sse2/; -specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/; -specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/; -specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/; -specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/; -specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/; -specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/; -specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/; -specialize qw/aom_dc_left_predictor_4x8 sse2/; -specialize qw/aom_dc_left_predictor_4x16 sse2/; -specialize qw/aom_dc_left_predictor_8x4 sse2/; -specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/; -specialize qw/aom_dc_left_predictor_8x16 sse2/; -specialize qw/aom_dc_left_predictor_8x32 sse2/; -specialize qw/aom_dc_left_predictor_16x4 sse2/; -specialize qw/aom_dc_left_predictor_16x8 sse2/; -specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/; -specialize qw/aom_dc_left_predictor_16x32 sse2/; -specialize qw/aom_dc_left_predictor_16x64 sse2/; -specialize qw/aom_dc_left_predictor_32x8 sse2/; -specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/; -specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/; -specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/; -specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/; -specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/; -specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/; -specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/; -specialize qw/aom_dc_128_predictor_4x8 sse2/; -specialize qw/aom_dc_128_predictor_4x16 sse2/; -specialize qw/aom_dc_128_predictor_8x4 sse2/; -specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/; -specialize qw/aom_dc_128_predictor_8x16 sse2/; -specialize qw/aom_dc_128_predictor_8x32 sse2/; -specialize qw/aom_dc_128_predictor_16x4 sse2/; -specialize qw/aom_dc_128_predictor_16x8 sse2/; -specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/; -specialize qw/aom_dc_128_predictor_16x32 sse2/; -specialize 
qw/aom_dc_128_predictor_16x64 sse2/; -specialize qw/aom_dc_128_predictor_32x8 sse2/; -specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/; -specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/; -specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/; -specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/; -specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/; -specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/; -specialize qw/aom_v_predictor_4x4 neon msa sse2/; -specialize qw/aom_v_predictor_4x8 sse2/; -specialize qw/aom_v_predictor_4x16 sse2/; -specialize qw/aom_v_predictor_8x4 sse2/; -specialize qw/aom_v_predictor_8x8 neon msa sse2/; -specialize qw/aom_v_predictor_8x16 sse2/; -specialize qw/aom_v_predictor_8x32 sse2/; -specialize qw/aom_v_predictor_16x4 sse2/; -specialize qw/aom_v_predictor_16x8 sse2/; -specialize qw/aom_v_predictor_16x16 neon msa sse2/; -specialize qw/aom_v_predictor_16x32 sse2/; -specialize qw/aom_v_predictor_16x64 sse2/; -specialize qw/aom_v_predictor_32x8 sse2/; -specialize qw/aom_v_predictor_32x16 sse2 avx2/; -specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/; -specialize qw/aom_v_predictor_32x64 sse2 avx2/; -specialize qw/aom_v_predictor_64x64 sse2 avx2/; -specialize qw/aom_v_predictor_64x32 sse2 avx2/; -specialize qw/aom_v_predictor_64x16 sse2 avx2/; -specialize qw/aom_h_predictor_4x8 sse2/; -specialize qw/aom_h_predictor_4x16 sse2/; -specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; -specialize qw/aom_h_predictor_8x4 sse2/; -specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/; -specialize qw/aom_h_predictor_8x16 sse2/; -specialize qw/aom_h_predictor_8x32 sse2/; -specialize qw/aom_h_predictor_16x4 sse2/; -specialize qw/aom_h_predictor_16x8 sse2/; -specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; -specialize qw/aom_h_predictor_16x32 sse2/; -specialize qw/aom_h_predictor_16x64 sse2/; -specialize qw/aom_h_predictor_32x8 sse2/; -specialize qw/aom_h_predictor_32x16 sse2/; -specialize qw/aom_h_predictor_32x32 neon msa 
sse2 avx2/; -specialize qw/aom_h_predictor_32x64 sse2/; -specialize qw/aom_h_predictor_64x64 sse2/; -specialize qw/aom_h_predictor_64x32 sse2/; -specialize qw/aom_h_predictor_64x16 sse2/; -specialize qw/aom_paeth_predictor_4x4 ssse3/; -specialize qw/aom_paeth_predictor_4x8 ssse3/; -specialize qw/aom_paeth_predictor_4x16 ssse3/; -specialize qw/aom_paeth_predictor_8x4 ssse3/; -specialize qw/aom_paeth_predictor_8x8 ssse3/; -specialize qw/aom_paeth_predictor_8x16 ssse3/; -specialize qw/aom_paeth_predictor_8x32 ssse3/; -specialize qw/aom_paeth_predictor_16x4 ssse3/; -specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/; -specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/; -specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/; -specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/; -specialize qw/aom_paeth_predictor_32x8 ssse3/; -specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/; -specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/; -specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/; -specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/; -specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/; -specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/; -specialize qw/aom_paeth_predictor_16x8 ssse3/; -specialize qw/aom_paeth_predictor_16x16 ssse3/; -specialize qw/aom_paeth_predictor_16x32 ssse3/; -specialize qw/aom_paeth_predictor_32x16 ssse3/; -specialize qw/aom_paeth_predictor_32x32 ssse3/; -specialize qw/aom_smooth_predictor_4x4 ssse3/; -specialize qw/aom_smooth_predictor_4x8 ssse3/; -specialize qw/aom_smooth_predictor_4x16 ssse3/; -specialize qw/aom_smooth_predictor_8x4 ssse3/; -specialize qw/aom_smooth_predictor_8x8 ssse3/; -specialize qw/aom_smooth_predictor_8x16 ssse3/; -specialize qw/aom_smooth_predictor_8x32 ssse3/; -specialize qw/aom_smooth_predictor_16x4 ssse3/; -specialize qw/aom_smooth_predictor_16x8 ssse3/; -specialize qw/aom_smooth_predictor_16x16 ssse3/; -specialize qw/aom_smooth_predictor_16x32 ssse3/; -specialize qw/aom_smooth_predictor_16x64 ssse3/; -specialize 
qw/aom_smooth_predictor_32x8 ssse3/; -specialize qw/aom_smooth_predictor_32x16 ssse3/; -specialize qw/aom_smooth_predictor_32x32 ssse3/; -specialize qw/aom_smooth_predictor_32x64 ssse3/; -specialize qw/aom_smooth_predictor_64x64 ssse3/; -specialize qw/aom_smooth_predictor_64x32 ssse3/; -specialize qw/aom_smooth_predictor_64x16 ssse3/; - -specialize qw/aom_smooth_v_predictor_4x4 ssse3/; -specialize qw/aom_smooth_v_predictor_4x8 ssse3/; -specialize qw/aom_smooth_v_predictor_4x16 ssse3/; -specialize qw/aom_smooth_v_predictor_8x4 ssse3/; -specialize qw/aom_smooth_v_predictor_8x8 ssse3/; -specialize qw/aom_smooth_v_predictor_8x16 ssse3/; -specialize qw/aom_smooth_v_predictor_8x32 ssse3/; -specialize qw/aom_smooth_v_predictor_16x4 ssse3/; -specialize qw/aom_smooth_v_predictor_16x8 ssse3/; -specialize qw/aom_smooth_v_predictor_16x16 ssse3/; -specialize qw/aom_smooth_v_predictor_16x32 ssse3/; -specialize qw/aom_smooth_v_predictor_16x64 ssse3/; -specialize qw/aom_smooth_v_predictor_32x8 ssse3/; -specialize qw/aom_smooth_v_predictor_32x16 ssse3/; -specialize qw/aom_smooth_v_predictor_32x32 ssse3/; -specialize qw/aom_smooth_v_predictor_32x64 ssse3/; -specialize qw/aom_smooth_v_predictor_64x64 ssse3/; -specialize qw/aom_smooth_v_predictor_64x32 ssse3/; -specialize qw/aom_smooth_v_predictor_64x16 ssse3/; - -specialize qw/aom_smooth_h_predictor_4x4 ssse3/; -specialize qw/aom_smooth_h_predictor_4x8 ssse3/; -specialize qw/aom_smooth_h_predictor_4x16 ssse3/; -specialize qw/aom_smooth_h_predictor_8x4 ssse3/; -specialize qw/aom_smooth_h_predictor_8x8 ssse3/; -specialize qw/aom_smooth_h_predictor_8x16 ssse3/; -specialize qw/aom_smooth_h_predictor_8x32 ssse3/; -specialize qw/aom_smooth_h_predictor_16x4 ssse3/; -specialize qw/aom_smooth_h_predictor_16x8 ssse3/; -specialize qw/aom_smooth_h_predictor_16x16 ssse3/; -specialize qw/aom_smooth_h_predictor_16x32 ssse3/; -specialize qw/aom_smooth_h_predictor_16x64 ssse3/; -specialize qw/aom_smooth_h_predictor_32x8 ssse3/; -specialize 
qw/aom_smooth_h_predictor_32x16 ssse3/; -specialize qw/aom_smooth_h_predictor_32x32 ssse3/; -specialize qw/aom_smooth_h_predictor_32x64 ssse3/; -specialize qw/aom_smooth_h_predictor_64x64 ssse3/; -specialize qw/aom_smooth_h_predictor_64x32 ssse3/; -specialize qw/aom_smooth_h_predictor_64x16 ssse3/; - -# TODO(yunqingwang): optimize rectangular DC_PRED to replace division -# by multiply and shift. -specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; -specialize qw/aom_dc_predictor_4x8 sse2/; -specialize qw/aom_dc_predictor_4x16 sse2/; -specialize qw/aom_dc_predictor_8x4 sse2/; -specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/; -specialize qw/aom_dc_predictor_8x16 sse2/; -specialize qw/aom_dc_predictor_8x32 sse2/; -specialize qw/aom_dc_predictor_16x4 sse2/; -specialize qw/aom_dc_predictor_16x8 sse2/; -specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/; -specialize qw/aom_dc_predictor_16x32 sse2/; -specialize qw/aom_dc_predictor_16x64 sse2/; -specialize qw/aom_dc_predictor_32x8 sse2/; -specialize qw/aom_dc_predictor_32x16 sse2 avx2/; -specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/; -specialize qw/aom_dc_predictor_32x64 sse2 avx2/; -specialize qw/aom_dc_predictor_64x64 sse2 avx2/; -specialize qw/aom_dc_predictor_64x32 sse2 avx2/; -specialize qw/aom_dc_predictor_64x16 sse2 avx2/; - - specialize qw/aom_highbd_v_predictor_4x4 sse2/; - specialize qw/aom_highbd_v_predictor_4x8 sse2/; - specialize qw/aom_highbd_v_predictor_8x4 sse2/; - specialize qw/aom_highbd_v_predictor_8x8 sse2/; - specialize qw/aom_highbd_v_predictor_8x16 sse2/; - specialize qw/aom_highbd_v_predictor_16x8 sse2/; - specialize qw/aom_highbd_v_predictor_16x16 sse2/; - specialize qw/aom_highbd_v_predictor_16x32 sse2/; - specialize qw/aom_highbd_v_predictor_32x16 sse2/; - specialize qw/aom_highbd_v_predictor_32x32 sse2/; - - # TODO(yunqingwang): optimize rectangular DC_PRED to replace division - # by multiply and shift. 
- specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/; - specialize qw/aom_highbd_dc_predictor_4x8 sse2/; - specialize qw/aom_highbd_dc_predictor_8x4 sse2/;; - specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;; - specialize qw/aom_highbd_dc_predictor_8x16 sse2/;; - specialize qw/aom_highbd_dc_predictor_16x8 sse2/; - specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/; - specialize qw/aom_highbd_dc_predictor_16x32 sse2/; - specialize qw/aom_highbd_dc_predictor_32x16 sse2/; - specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/; - specialize qw/aom_highbd_dc_predictor_64x64 neon/; - - specialize qw/aom_highbd_h_predictor_4x4 sse2/; - specialize qw/aom_highbd_h_predictor_4x8 sse2/; - specialize qw/aom_highbd_h_predictor_8x4 sse2/; - specialize qw/aom_highbd_h_predictor_8x8 sse2/; - specialize qw/aom_highbd_h_predictor_8x16 sse2/; - specialize qw/aom_highbd_h_predictor_16x8 sse2/; - specialize qw/aom_highbd_h_predictor_16x16 sse2/; - specialize qw/aom_highbd_h_predictor_16x32 sse2/; - specialize qw/aom_highbd_h_predictor_32x16 sse2/; - specialize qw/aom_highbd_h_predictor_32x32 sse2/; - specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/; - specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/; - specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/; - specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/; - specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/; - specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/; - specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/; - specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/; - specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/; - specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/; - specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/; - specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/; - specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/; - specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/; - specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/; - specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/; - 
specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/; - specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/; - specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/; - specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/; - specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/; - specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/; - specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/; - specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/; - specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/; - specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/; - specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/; - specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/; - specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/; - specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/; - -# -# Sub Pixel Filters -# -add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - -specialize qw/aom_convolve_copy sse2 /; -specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3"; -specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3"; - -add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; -specialize qw/aom_highbd_convolve_copy sse2 avx2/; - -add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; -specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64"; - -add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; -specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64"; - -# -# Loopfilter -# -add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_14 sse2 neon/; - -add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_vertical_14_dual sse2/; - -add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_6 sse2 neon/; - -add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_8 sse2 neon/; - -add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_vertical_8_dual sse2/; - -add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_4 sse2 neon/; - -add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t 
*thresh1"; -specialize qw/aom_lpf_vertical_4_dual sse2/; - -add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_14 sse2 neon/; - -add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_14_dual sse2/; - -add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_6 sse2 neon/; - -add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_6_dual sse2/; - -add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_8 sse2 neon/; - -add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_8_dual sse2/; - -add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_4 sse2 neon/; - -add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_4_dual sse2/; - -add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, 
int bd"; -specialize qw/aom_highbd_lpf_vertical_14 sse2/; - -add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; -specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/; - -add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; -specialize qw/aom_highbd_lpf_vertical_8 sse2/; - -add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; -specialize qw/aom_highbd_lpf_vertical_6 sse2/; - -add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_vertical_6_dual sse2/; - -add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; -specialize qw/aom_highbd_lpf_vertical_6_dual sse2/; - -add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; -specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; - -add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; -specialize qw/aom_highbd_lpf_vertical_4 sse2/; - -add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, 
int bd"; -specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; - -add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; -specialize qw/aom_highbd_lpf_horizontal_14 sse2/; - -add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd"; -specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/; - -add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; -specialize qw/aom_highbd_lpf_horizontal_6 sse2/; - -add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; -specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/; - -add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; -specialize qw/aom_highbd_lpf_horizontal_8 sse2/; - -add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; -specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; - -add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; -specialize qw/aom_highbd_lpf_horizontal_4 sse2/; - -add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; -specialize 
qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; - -# Helper functions. -add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; -specialize "av1_round_shift_array", qw/sse4_1 neon/; - -# -# Encoder functions. -# - -# -# Forward transform -# -if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ - add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64"; - - # High bit depth - add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_highbd_fdct8x8 sse2/; - - # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation) - add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output"; - - add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_fft4x4_float sse2/; - - add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_fft8x8_float avx2 sse2/; - - add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_fft16x16_float avx2 sse2/; - - add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_fft32x32_float avx2 sse2/; - - add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output"; - - add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_ifft4x4_float sse2/; - - add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_ifft8x8_float avx2 sse2/; - - add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_ifft16x16_float avx2 sse2/; - - add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output"; - specialize qw/aom_ifft32x32_float avx2 sse2/; -} # 
CONFIG_AV1_ENCODER - -# -# Quantization -# -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; - - add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; - - add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -} # CONFIG_AV1_ENCODER - -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b sse2 avx2/; - - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t 
*dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b_32x32 sse2/; - - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - -} # CONFIG_AV1_ENCODER - -# -# Alpha blending with mask -# -add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params"; -specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; -add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd"; -add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby"; -add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; -add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; -specialize "aom_blend_a64_mask", qw/sse4_1 avx2/; -specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; -specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; - 
-add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd"; -add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; -add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; -specialize "aom_highbd_blend_a64_mask", qw/sse4_1/; -specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/; -specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/; - -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - # - # Block subtraction - # - add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; - specialize qw/aom_subtract_block neon msa sse2 avx2/; - - add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/aom_highbd_subtract_block sse2/; - - add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; - specialize qw/aom_sse sse4_1 avx2/; - - add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; - specialize qw/aom_highbd_sse sse4_1 avx2/; - - if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - # - # Sum of Squares - # - add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; - specialize 
qw/aom_sum_squares_2d_i16 sse2 avx2/; - - add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; - specialize qw/aom_sum_squares_i16 sse2/; - - } - - - # - # Single block SAD / Single block Avg SAD - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param"; - } - - specialize qw/aom_sad128x128 avx2 sse2/; - specialize qw/aom_sad128x64 avx2 sse2/; - specialize qw/aom_sad64x128 avx2 sse2/; - specialize qw/aom_sad64x64 avx2 neon msa sse2/; - specialize qw/aom_sad64x32 avx2 msa sse2/; - specialize qw/aom_sad32x64 avx2 msa sse2/; - specialize qw/aom_sad32x32 avx2 neon msa sse2/; - specialize qw/aom_sad32x16 avx2 msa sse2/; - specialize qw/aom_sad16x32 msa sse2/; - specialize qw/aom_sad16x16 neon msa sse2/; - specialize qw/aom_sad16x8 neon msa sse2/; - specialize qw/aom_sad8x16 neon msa sse2/; - specialize qw/aom_sad8x8 neon msa sse2/; - specialize qw/aom_sad8x4 msa sse2/; - specialize qw/aom_sad4x8 msa sse2/; - specialize qw/aom_sad4x4 neon msa sse2/; - - specialize qw/aom_sad128x128_avg avx2 sse2/; - specialize qw/aom_sad128x64_avg avx2 sse2/; - specialize qw/aom_sad64x128_avg avx2 sse2/; - specialize qw/aom_sad64x64_avg avx2 msa sse2/; - specialize qw/aom_sad64x32_avg avx2 msa sse2/; - specialize qw/aom_sad32x64_avg avx2 msa sse2/; - specialize qw/aom_sad32x32_avg avx2 msa sse2/; - specialize qw/aom_sad32x16_avg avx2 msa sse2/; - specialize qw/aom_sad16x32_avg msa sse2/; - specialize qw/aom_sad16x16_avg msa sse2/; - specialize qw/aom_sad16x8_avg msa sse2/; - 
specialize qw/aom_sad8x16_avg msa sse2/; - specialize qw/aom_sad8x8_avg msa sse2/; - specialize qw/aom_sad8x4_avg msa sse2/; - specialize qw/aom_sad4x8_avg msa sse2/; - specialize qw/aom_sad4x4_avg msa sse2/; - - specialize qw/aom_sad4x16 sse2/; - specialize qw/aom_sad16x4 sse2/; - specialize qw/aom_sad8x32 sse2/; - specialize qw/aom_sad32x8 sse2/; - specialize qw/aom_sad16x64 sse2/; - specialize qw/aom_sad64x16 sse2/; - - specialize qw/aom_sad4x16_avg sse2/; - specialize qw/aom_sad16x4_avg sse2/; - specialize qw/aom_sad8x32_avg sse2/; - specialize qw/aom_sad32x8_avg sse2/; - specialize qw/aom_sad16x64_avg sse2/; - specialize qw/aom_sad64x16_avg sse2/; - - specialize qw/aom_jnt_sad128x128_avg ssse3/; - specialize qw/aom_jnt_sad128x64_avg ssse3/; - specialize qw/aom_jnt_sad64x128_avg ssse3/; - specialize qw/aom_jnt_sad64x64_avg ssse3/; - specialize qw/aom_jnt_sad64x32_avg ssse3/; - specialize qw/aom_jnt_sad32x64_avg ssse3/; - specialize qw/aom_jnt_sad32x32_avg ssse3/; - specialize qw/aom_jnt_sad32x16_avg ssse3/; - specialize qw/aom_jnt_sad16x32_avg ssse3/; - specialize qw/aom_jnt_sad16x16_avg ssse3/; - specialize qw/aom_jnt_sad16x8_avg ssse3/; - specialize qw/aom_jnt_sad8x16_avg ssse3/; - specialize qw/aom_jnt_sad8x8_avg ssse3/; - specialize qw/aom_jnt_sad8x4_avg ssse3/; - specialize qw/aom_jnt_sad4x8_avg ssse3/; - specialize qw/aom_jnt_sad4x4_avg ssse3/; - - specialize qw/aom_jnt_sad4x16_avg ssse3/; - specialize qw/aom_jnt_sad16x4_avg ssse3/; - specialize qw/aom_jnt_sad8x32_avg ssse3/; - specialize qw/aom_jnt_sad32x8_avg ssse3/; - specialize qw/aom_jnt_sad16x64_avg ssse3/; - specialize qw/aom_jnt_sad64x16_avg ssse3/; - - add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; - add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; - add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const 
uint8_t *b, int b_stride, int width, int height"; - add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; - add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; - add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; - - specialize qw/aom_sad4xh sse2/; - specialize qw/aom_sad8xh sse2/; - specialize qw/aom_sad16xh sse2/; - specialize qw/aom_sad32xh sse2/; - specialize qw/aom_sad64xh sse2/; - specialize qw/aom_sad128xh sse2/; - - - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - if ($w != 128 && $h != 128 && $w != 4) { - specialize "aom_highbd_sad${w}x${h}", qw/sse2/; - specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; - } - add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param"; - } - specialize qw/aom_highbd_sad128x128 avx2/; - specialize qw/aom_highbd_sad128x64 avx2/; - specialize qw/aom_highbd_sad64x128 avx2/; - specialize qw/aom_highbd_sad64x64 avx2 sse2/; - specialize qw/aom_highbd_sad64x32 avx2 sse2/; - specialize qw/aom_highbd_sad32x64 avx2 sse2/; - specialize qw/aom_highbd_sad32x32 avx2 sse2/; - specialize qw/aom_highbd_sad32x16 avx2 sse2/; - specialize qw/aom_highbd_sad16x32 avx2 sse2/; - specialize qw/aom_highbd_sad16x16 avx2 sse2/; - specialize qw/aom_highbd_sad16x8 avx2 sse2/; - specialize qw/aom_highbd_sad8x4 sse2/; - - specialize qw/aom_highbd_sad128x128_avg avx2/; - 
specialize qw/aom_highbd_sad128x64_avg avx2/; - specialize qw/aom_highbd_sad64x128_avg avx2/; - specialize qw/aom_highbd_sad64x64_avg avx2 sse2/; - specialize qw/aom_highbd_sad64x32_avg avx2 sse2/; - specialize qw/aom_highbd_sad32x64_avg avx2 sse2/; - specialize qw/aom_highbd_sad32x32_avg avx2 sse2/; - specialize qw/aom_highbd_sad32x16_avg avx2 sse2/; - specialize qw/aom_highbd_sad16x32_avg avx2 sse2/; - specialize qw/aom_highbd_sad16x16_avg avx2 sse2/; - specialize qw/aom_highbd_sad16x8_avg avx2 sse2/; - specialize qw/aom_highbd_sad8x4_avg sse2/; - - specialize qw/aom_highbd_sad16x4 sse2/; - specialize qw/aom_highbd_sad8x32 sse2/; - specialize qw/aom_highbd_sad32x8 sse2/; - specialize qw/aom_highbd_sad16x64 sse2/; - specialize qw/aom_highbd_sad64x16 sse2/; - - specialize qw/aom_highbd_sad16x4_avg sse2/; - specialize qw/aom_highbd_sad8x32_avg sse2/; - specialize qw/aom_highbd_sad32x8_avg sse2/; - specialize qw/aom_highbd_sad16x64_avg sse2/; - specialize qw/aom_highbd_sad64x16_avg sse2/; - - # - # Masked SAD - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; - specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/; - } - - - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; - specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/; - } - - - # - # OBMC SAD - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; - if (! 
(($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/; - } - } - - - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; - if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/; - } - } - - - # - # Multi-block SAD, comparing a reference to N independent blocks - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; - } - - specialize qw/aom_sad128x128x4d avx2 sse2/; - specialize qw/aom_sad128x64x4d avx2 sse2/; - specialize qw/aom_sad64x128x4d avx2 sse2/; - specialize qw/aom_sad64x64x4d avx2 neon msa sse2/; - specialize qw/aom_sad64x32x4d avx2 msa sse2/; - specialize qw/aom_sad32x64x4d avx2 msa sse2/; - specialize qw/aom_sad32x32x4d avx2 neon msa sse2/; - specialize qw/aom_sad32x16x4d msa sse2/; - specialize qw/aom_sad16x32x4d msa sse2/; - specialize qw/aom_sad16x16x4d neon msa sse2/; - specialize qw/aom_sad16x8x4d msa sse2/; - specialize qw/aom_sad8x16x4d msa sse2/; - specialize qw/aom_sad8x8x4d msa sse2/; - specialize qw/aom_sad8x4x4d msa sse2/; - specialize qw/aom_sad4x8x4d msa sse2/; - specialize qw/aom_sad4x4x4d msa sse2/; - - specialize qw/aom_sad4x16x4d sse2/; - specialize qw/aom_sad16x4x4d sse2/; - specialize qw/aom_sad8x32x4d sse2/; - specialize qw/aom_sad32x8x4d sse2/; - specialize qw/aom_sad16x64x4d sse2/; - specialize qw/aom_sad64x16x4d sse2/; - - # - # Multi-block SAD, comparing a reference to N independent blocks - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; - if ($w != 128 && $h != 128) 
{ - specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; - } - } - specialize qw/aom_highbd_sad128x128x4d avx2/; - specialize qw/aom_highbd_sad128x64x4d avx2/; - specialize qw/aom_highbd_sad64x128x4d avx2/; - specialize qw/aom_highbd_sad64x64x4d sse2 avx2/; - specialize qw/aom_highbd_sad64x32x4d sse2 avx2/; - specialize qw/aom_highbd_sad32x64x4d sse2 avx2/; - specialize qw/aom_highbd_sad32x32x4d sse2 avx2/; - specialize qw/aom_highbd_sad32x16x4d sse2 avx2/; - specialize qw/aom_highbd_sad16x32x4d sse2 avx2/; - specialize qw/aom_highbd_sad16x16x4d sse2 avx2/; - specialize qw/aom_highbd_sad16x8x4d sse2 avx2/; - specialize qw/aom_highbd_sad8x16x4d sse2/; - specialize qw/aom_highbd_sad8x8x4d sse2/; - specialize qw/aom_highbd_sad8x4x4d sse2/; - specialize qw/aom_highbd_sad4x8x4d sse2/; - specialize qw/aom_highbd_sad4x4x4d sse2/; - - specialize qw/aom_highbd_sad4x16x4d sse2/; - specialize qw/aom_highbd_sad16x4x4d sse2/; - specialize qw/aom_highbd_sad8x32x4d sse2/; - specialize qw/aom_highbd_sad32x8x4d sse2/; - specialize qw/aom_highbd_sad16x64x4d sse2/; - specialize qw/aom_highbd_sad64x16x4d sse2/; - - # - # Structured Similarity (SSIM) - # - if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { - add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; - - add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; - - add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - - } -} # CONFIG_AV1_ENCODER - -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - - # - # Specialty 
Variance - # - add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - - add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - - specialize qw/aom_get16x16var neon msa/; - specialize qw/aom_get8x8var neon msa/; - - - add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - - specialize qw/aom_mse16x16 sse2 avx2 neon msa/; - specialize qw/aom_mse16x8 sse2 msa/; - specialize qw/aom_mse8x16 sse2 msa/; - specialize qw/aom_mse8x8 sse2 msa/; - - foreach $bd (8, 10, 12) { - add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - - specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; - specialize "aom_highbd_${bd}_mse8x8", qw/sse2/; - } - - - # - # - # - add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search"; - specialize qw/aom_upsampled_pred sse2/; - - add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, int subpel_search"; - specialize qw/aom_comp_avg_upsampled_pred sse2/; - - add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; - specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/; - - add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int subpel_search"; - specialize qw/aom_comp_mask_upsampled_pred sse2/; - - - add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, int width, 
int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; - specialize qw/aom_highbd_upsampled_pred sse2/; - - add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; - specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; - - add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; - specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/; - - - # - # - # - add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; - add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; - - specialize qw/aom_get_mb_ss sse2 msa/; - specialize qw/aom_get4x4sse_cs neon msa/; - - # - # Variance / Subpixel Variance / Subpixel Avg Variance - # - add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse"; - add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param"; - } - specialize qw/aom_variance128x128 sse2 avx2 /; - specialize qw/aom_variance128x64 sse2 avx2 /; - specialize qw/aom_variance64x128 sse2 avx2 /; - specialize qw/aom_variance64x64 sse2 avx2 neon msa/; - specialize qw/aom_variance64x32 sse2 avx2 neon msa/; - specialize qw/aom_variance32x64 sse2 avx2 neon msa/; - specialize qw/aom_variance32x32 sse2 avx2 neon msa/; - specialize qw/aom_variance32x16 sse2 avx2 msa/; - specialize qw/aom_variance16x32 sse2 avx2 msa/; - specialize qw/aom_variance16x16 sse2 avx2 neon msa/; - specialize qw/aom_variance16x8 sse2 avx2 neon msa/; - specialize qw/aom_variance8x16 sse2 neon msa/; - specialize qw/aom_variance8x8 sse2 neon msa/; - specialize qw/aom_variance8x4 sse2 msa/; - specialize qw/aom_variance4x8 sse2 msa/; - specialize qw/aom_variance4x4 sse2 msa/; - - specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x16 
avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; - - specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - - specialize qw/aom_variance4x16 sse2/; - specialize qw/aom_variance16x4 sse2 avx2/; - specialize qw/aom_variance8x32 sse2/; - specialize qw/aom_variance32x8 sse2 avx2/; - specialize qw/aom_variance16x64 sse2 avx2/; - specialize qw/aom_variance64x16 sse2 avx2/; - specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x32 sse2 
ssse3/; - specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; - - specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x4 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance4x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance4x4 ssse3/; - - specialize qw/aom_jnt_sub_pixel_avg_variance4x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x4 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/; - - specialize qw/aom_jnt_sub_pixel_avg_variance128x128 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance128x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance64x128 ssse3/; - - - foreach $bd (8, 10, 12) { - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { - specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; - } - # TODO(david.barker): When ext-partition-types is enabled, we currently - # don't have vectorized 4x16 highbd variance functions - if ($w == 4 && $h == 4) { - specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; - } - if ($w != 128 && $h != 128 && $w != 4) { - specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; - } - if ($w == 4 && $h == 4) { - specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; - } - - add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const 
JNT_COMP_PARAMS* jcp_param"; - } - } - - # - # Masked Variance / Masked Subpixel Variance - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; - specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/; - } - - - foreach $bd ("_8_", "_10_", "_12_") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; - specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/; - } - } - - - # - # OBMC Variance / OBMC Subpixel Variance - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/; - specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/; - } - - - foreach $bd ("_", "_10_", "_12_") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize 
"aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; - } - } - - - add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize 
qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; - - add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - - # - # Comp Avg - # - add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - - add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t 
*ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; - specialize qw/aom_jnt_comp_avg_pred ssse3/; - - add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance128x128 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance128x64 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x128 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x64 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x32 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x64 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x32 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x16 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - 
specialize qw/aom_highbd_12_variance16x32 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x16 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x8 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x16 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x8 sse2/; - - add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance128x128 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance128x64 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - 
specialize qw/aom_highbd_10_variance64x128 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x64 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x32 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x64 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x32 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x16 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x32 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x16 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x8 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/aom_highbd_10_variance8x16 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x8 sse2 avx2/; - - add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance128x128 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance128x64 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x128 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x64 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x32 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/aom_highbd_8_variance32x64 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x32 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x16 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x32 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x16 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x8 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x16 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x8 sse2/; - - add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; 
- - add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - - add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - - add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - - add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_mse16x16 sse2/; - - add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_mse8x8 sse2/; - - add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_mse16x16 sse2/; - - add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_mse8x8 sse2/; - - add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_mse16x16 sse2/; - - add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_mse8x8 sse2/; - - add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; - - add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; - specialize qw/aom_highbd_jnt_comp_avg_pred sse2/; - - # - # Subpixel Variance - # - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x16 
sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, 
int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/; - - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, 
const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/; - - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/; - - add_proto 
qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/; - - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - - - - add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, 
const uint8_t *mask, int mask_stride, int invert_mask"; - specialize qw/aom_comp_mask_pred ssse3 avx2/; - - add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - specialize qw/aom_highbd_comp_mask_pred sse2 avx2/; - -} # CONFIG_AV1_ENCODER - -1; diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h deleted file mode 100644 index 00686ac38..000000000 --- a/third_party/aom/aom_dsp/aom_filter.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_AOM_FILTER_H_ -#define AOM_AOM_DSP_AOM_FILTER_H_ - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define FILTER_BITS 7 - -#define SUBPEL_BITS 4 -#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) -#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) -#define SUBPEL_TAPS 8 - -#define SCALE_SUBPEL_BITS 10 -#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS) -#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1) -#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS) -#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2) - -#define RS_SUBPEL_BITS 6 -#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1) -#define RS_SCALE_SUBPEL_BITS 14 -#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1) -#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS) -#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1)) - -typedef int16_t InterpKernel[SUBPEL_TAPS]; - -#define BIL_SUBPEL_BITS 3 -#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS) - -// 2 tap bilinear filters -static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_AOM_FILTER_H_ diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h deleted file mode 100644 index ab950ca55..000000000 --- a/third_party/aom/aom_dsp/aom_simd.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_AOM_SIMD_H_ -#define AOM_AOM_DSP_AOM_SIMD_H_ - -#include <stdint.h> - -#if defined(_WIN32) -#include <intrin.h> -#endif - -#include "config/aom_config.h" - -#include "aom_dsp/aom_simd_inline.h" - -#define SIMD_CHECK 1 // Sanity checks in C equivalents - -#if HAVE_NEON -#include "simd/v256_intrinsics_arm.h" -// VS compiling for 32 bit targets does not support vector types in -// structs as arguments, which makes the v256 type of the intrinsics -// hard to support, so optimizations for this target are disabled. -#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) -#include "simd/v256_intrinsics_x86.h" -#else -#include "simd/v256_intrinsics.h" -#endif - -#endif // AOM_AOM_DSP_AOM_SIMD_H_ diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h deleted file mode 100644 index eb333f6f6..000000000 --- a/third_party/aom/aom_dsp/aom_simd_inline.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_ -#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_ - -#include "aom/aom_integer.h" - -#ifndef SIMD_INLINE -#define SIMD_INLINE static AOM_FORCE_INLINE -#endif - -#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_ diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c deleted file mode 100644 index e7f08a5fd..000000000 --- a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c +++ /dev/null @@ -1,451 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <arm_neon.h> -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/blend.h" -#include "aom_ports/mem.h" -#include "av1/common/arm/mem_neon.h" -#include "config/aom_dsp_rtcd.h" - -static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1, - const int16x8_t v_maxval, int16x8_t *res) { - int32x4_t im_res_low, im_res_high; - const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask); - - im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0)); - im_res_low = - vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1)); - - im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0)); - im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask), - vget_high_s16(src_1)); - - *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS), - vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS)); -} - -static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride, - const CONV_BUF_TYPE *src0, uint32_t src0_stride, - const CONV_BUF_TYPE *src1, uint32_t src1_stride, - int16x8_t mask0, int16x8_t mask1, int16x8_t mask2, - int16x8_t mask3, const int16x8_t v_maxval, - const uint16x8_t vec_round_offset, - const int16x8_t vec_round_bits) { - int16x8_t src0_0, src0_1, src0_2, src0_3; - int16x8_t src1_0, src1_1, src1_2, src1_3; - int16x8_t im_res_0, im_res_1, im_res_2, im_res_3; - - load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2, - &src0_3); - load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2, - &src1_3); - - blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0); - blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1); - blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2); - blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3); - - uint16x8_t im_res1_0 = - vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset); - uint16x8_t im_res1_1 = - vqsubq_u16(vreinterpretq_u16_s16(im_res_1), 
vec_round_offset); - uint16x8_t im_res1_2 = - vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset); - uint16x8_t im_res1_3 = - vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset); - - im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits); - im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits); - im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits); - im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits); - - vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0)); - vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1)); - vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2)); - vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3)); -} - -static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride, - const CONV_BUF_TYPE *src0, uint32_t src0_stride, - const CONV_BUF_TYPE *src1, uint32_t src1_stride, - int16x4_t mask0, int16x4_t mask1, int16x4_t mask2, - int16x4_t mask3, const int16x8_t v_maxval, - const uint16x8_t vec_round_offset, - const int16x8_t vec_round_bits) { - int16x8_t src0_0, src0_1; - int16x8_t src1_0, src1_1; - uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0), - tu3 = vdupq_n_u64(0); - int16x8_t mask0_1, mask2_3; - int16x8_t res0, res1; - - load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1); - load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3); - - src0_0 = vreinterpretq_s16_u64(tu0); - src0_1 = vreinterpretq_s16_u64(tu1); - - src1_0 = vreinterpretq_s16_u64(tu2); - src1_1 = vreinterpretq_s16_u64(tu3); - - mask0_1 = vcombine_s16(mask0, mask1); - mask2_3 = vcombine_s16(mask2, mask3); - - blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0); - blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1); - - uint16x8_t im_res_0 = - vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset); - uint16x8_t im_res_1 = - vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset); - - src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits); 
- src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits); - - uint8x8_t res_0 = vqmovun_s16(src0_0); - uint8x8_t res_1 = vqmovun_s16(src0_1); - - vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0), - 0); - vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0), - 1); - vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1), - 0); - vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1), - 1); -} - -void aom_lowbd_blend_a64_d16_mask_neon( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, - ConvolveParams *conv_params) { - int i = 0; - const int bd = 8; - int w_tmp = w; - const uint8_t *mask_tmp = mask; - const CONV_BUF_TYPE *src0_tmp = src0; - const CONV_BUF_TYPE *src1_tmp = src1; - uint8_t *dst_tmp = dst; - - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - - assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); - - assert(h >= 4); - assert(w >= 4); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - uint8x8_t s0, s1, s2, s3; - uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0), - tu3 = vdup_n_u32(0); - uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; - int16x8_t mask0, mask1, mask2, mask3; - int16x8_t mask4, mask5, mask6, mask7; - int32x4_t m0_32, m1_32, m2_32, m3_32; - int32x4_t m4_32, m5_32, m6_32, m7_32; - uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l; - uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l; - int16x4_t mask0_low, mask1_low, mask2_low, mask3_low; - 
const uint16x4_t vec_zero = vdup_n_u16(0); - const uint16_t offset = round_offset - (1 << (round_bits - 1)); - const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA); - const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); - const uint16x8_t vec_offset = vdupq_n_u16(offset); - - if (subw == 0 && subh == 0) { - if (w_tmp > 7) { - do { - w_tmp = w; - do { - load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3); - - mask0 = vmovl_s8(vreinterpret_s8_u8(s0)); - mask1 = vmovl_s8(vreinterpret_s8_u8(s1)); - mask2 = vmovl_s8(vreinterpret_s8_u8(s2)); - mask3 = vmovl_s8(vreinterpret_s8_u8(s3)); - - blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0, mask1, mask2, mask3, v_maxval, - vec_offset, vec_round_bits); - - w_tmp -= 8; - mask_tmp += 8; - dst_tmp += 8; - src0_tmp += 8; - src1_tmp += 8; - } while (w_tmp > 7); - i += 4; - mask_tmp += (4 * mask_stride) - w; - dst_tmp += (4 * dst_stride) - w; - src0_tmp += (4 * src0_stride) - w; - src1_tmp += (4 * src1_stride) - w; - } while (i < h); - } else { - do { - load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1); - - mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); - mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1))); - - mask0_low = vget_low_s16(mask0); - mask1_low = vget_high_s16(mask0); - mask2_low = vget_low_s16(mask1); - mask3_low = vget_high_s16(mask1); - - blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, - v_maxval, vec_offset, vec_round_bits); - - i += 4; - mask_tmp += (4 * mask_stride); - dst_tmp += (4 * dst_stride); - src0_tmp += (4 * src0_stride); - src1_tmp += (4 * src1_stride); - } while (i < h); - } - } else if (subw == 1 && subh == 1) { - if (w_tmp > 7) { - do { - w_tmp = w; - do { - load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, - &t7); - - mask0 = - vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1))); - mask1 = - 
vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3))); - mask2 = - vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5))); - mask3 = - vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7))); - - mask4 = vreinterpretq_s16_u16( - vaddl_u8(vget_high_u8(t0), vget_high_u8(t1))); - mask5 = vreinterpretq_s16_u16( - vaddl_u8(vget_high_u8(t2), vget_high_u8(t3))); - mask6 = vreinterpretq_s16_u16( - vaddl_u8(vget_high_u8(t4), vget_high_u8(t5))); - mask7 = vreinterpretq_s16_u16( - vaddl_u8(vget_high_u8(t6), vget_high_u8(t7))); - - m0_32 = vpaddlq_s16(mask0); - m1_32 = vpaddlq_s16(mask1); - m2_32 = vpaddlq_s16(mask2); - m3_32 = vpaddlq_s16(mask3); - - m4_32 = vpaddlq_s16(mask4); - m5_32 = vpaddlq_s16(mask5); - m6_32 = vpaddlq_s16(mask6); - m7_32 = vpaddlq_s16(mask7); - - mask0 = - vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2)); - mask1 = - vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2)); - mask2 = - vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2)); - mask3 = - vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2)); - - blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0, mask1, mask2, mask3, v_maxval, - vec_offset, vec_round_bits); - - w_tmp -= 8; - mask_tmp += 16; - dst_tmp += 8; - src0_tmp += 8; - src1_tmp += 8; - } while (w_tmp > 7); - i += 4; - mask_tmp += (8 * mask_stride) - (2 * w); - dst_tmp += (4 * dst_stride) - w; - src0_tmp += (4 * src0_stride) - w; - src1_tmp += (4 * src1_stride) - w; - } while (i < h); - } else { - do { - load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, - &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); - - mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); - mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); - mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); - mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); - - m0_32 = vpaddlq_s16(mask0); - m1_32 = vpaddlq_s16(mask1); - m2_32 
= vpaddlq_s16(mask2); - m3_32 = vpaddlq_s16(mask3); - - mask0_low = vqrshrn_n_s32(m0_32, 2); - mask1_low = vqrshrn_n_s32(m1_32, 2); - mask2_low = vqrshrn_n_s32(m2_32, 2); - mask3_low = vqrshrn_n_s32(m3_32, 2); - - blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, - v_maxval, vec_offset, vec_round_bits); - - i += 4; - mask_tmp += (8 * mask_stride); - dst_tmp += (4 * dst_stride); - src0_tmp += (4 * src0_stride); - src1_tmp += (4 * src1_stride); - } while (i < h); - } - } else if (subw == 1 && subh == 0) { - if (w_tmp > 7) { - do { - w_tmp = w; - do { - load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3); - - mask0 = vreinterpretq_s16_u16(vcombine_u16( - vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0)))); - mask1 = vreinterpretq_s16_u16(vcombine_u16( - vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1)))); - mask2 = vreinterpretq_s16_u16(vcombine_u16( - vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2)))); - mask3 = vreinterpretq_s16_u16(vcombine_u16( - vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3)))); - - mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); - mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); - mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); - mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); - - blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0, mask1, mask2, mask3, v_maxval, - vec_offset, vec_round_bits); - w_tmp -= 8; - mask_tmp += 16; - dst_tmp += 8; - src0_tmp += 8; - src1_tmp += 8; - } while (w_tmp > 7); - i += 4; - mask_tmp += (4 * mask_stride) - (2 * w); - dst_tmp += (4 * dst_stride) - w; - src0_tmp += (4 * src0_stride) - w; - src1_tmp += (4 * src1_stride) - w; - } while (i < h); - } else { - do { - load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, - &mask3_l); - - mask0 = - vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero)); - mask1 = - vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero)); - 
mask2 = - vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero)); - mask3 = - vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero)); - - mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1))); - mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1))); - mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1))); - mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1))); - - blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, - v_maxval, vec_offset, vec_round_bits); - - i += 4; - mask_tmp += (4 * mask_stride); - dst_tmp += (4 * dst_stride); - src0_tmp += (4 * src0_stride); - src1_tmp += (4 * src1_stride); - } while (i < h); - } - } else { - if (w_tmp > 7) { - do { - w_tmp = w; - do { - load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, - &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); - - mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); - mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); - mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); - mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); - - mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); - mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); - mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); - mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); - - blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0, mask1, mask2, mask3, v_maxval, - vec_offset, vec_round_bits); - - w_tmp -= 8; - mask_tmp += 8; - dst_tmp += 8; - src0_tmp += 8; - src1_tmp += 8; - } while (w_tmp > 7); - i += 4; - mask_tmp += (8 * mask_stride) - w; - dst_tmp += (4 * dst_stride) - w; - src0_tmp += (4 * src0_stride) - w; - src1_tmp += (4 * src1_stride) - w; - } while (i < h); - } else { - do { - load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1); - load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2, - &tu3); - - s0 = vreinterpret_u8_u32(tu0); - s1 = 
vreinterpret_u8_u32(tu1); - s2 = vreinterpret_u8_u32(tu2); - s3 = vreinterpret_u8_u32(tu3); - - mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2)); - mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3)); - - mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); - mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); - - mask0_low = vget_low_s16(mask0); - mask1_low = vget_high_s16(mask0); - mask2_low = vget_low_s16(mask1); - mask3_low = vget_high_s16(mask1); - - blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, - src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, - v_maxval, vec_offset, vec_round_bits); - - i += 4; - mask_tmp += (8 * mask_stride); - dst_tmp += (4 * dst_stride); - src0_tmp += (4 * src0_stride); - src1_tmp += (4 * src1_stride); - } while (i < h); - } - } -} diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c deleted file mode 100644 index e4300c992..000000000 --- a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <arm_neon.h> - -#include "config/aom_config.h" - -#include "aom_dsp/txfm_common.h" - -void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { - int i; - // stage 1 - int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); - int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); - int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); - int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); - int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); - int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); - int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); - int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); - for (i = 0; i < 2; ++i) { - int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; - const int16x8_t v_s0 = vaddq_s16(input_0, input_7); - const int16x8_t v_s1 = vaddq_s16(input_1, input_6); - const int16x8_t v_s2 = vaddq_s16(input_2, input_5); - const int16x8_t v_s3 = vaddq_s16(input_3, input_4); - const int16x8_t v_s4 = vsubq_s16(input_3, input_4); - const int16x8_t v_s5 = vsubq_s16(input_2, input_5); - const int16x8_t v_s6 = vsubq_s16(input_1, input_6); - const int16x8_t v_s7 = vsubq_s16(input_0, input_7); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t3_lo = 
vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab 
= vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out_5 = vcombine_s16(b, d); 
// 14 15 16 17 54 55 56 57 - out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 - } - // transpose 8x8 - { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - input_0 = r01_s16.val[0]; - input_1 = r01_s16.val[1]; - input_2 = r23_s16.val[0]; - input_3 = r23_s16.val[1]; - input_4 = r45_s16.val[0]; - input_5 = r45_s16.val[1]; - input_6 = r67_s16.val[0]; - input_7 = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } // for - { - // from aom_dct_sse2.c - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); - const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); - const 
int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); - const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); - const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); - const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); - const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); - const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); - input_0 = vhsubq_s16(input_0, sign_in0); - input_1 = vhsubq_s16(input_1, sign_in1); - input_2 = vhsubq_s16(input_2, sign_in2); - input_3 = vhsubq_s16(input_3, sign_in3); - input_4 = vhsubq_s16(input_4, sign_in4); - input_5 = vhsubq_s16(input_5, sign_in5); - input_6 = vhsubq_s16(input_6, sign_in6); - input_7 = vhsubq_s16(input_7, sign_in7); - // store results - vst1q_s16(&final_output[0 * 8], input_0); - vst1q_s16(&final_output[1 * 8], input_1); - vst1q_s16(&final_output[2 * 8], input_2); - vst1q_s16(&final_output[3 * 8], input_3); - vst1q_s16(&final_output[4 * 8], input_4); - vst1q_s16(&final_output[5 * 8], input_5); - vst1q_s16(&final_output[6 * 8], input_6); - vst1q_s16(&final_output[7 * 8], input_7); - } -} - -void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { - int r; - int16x8_t sum = vld1q_s16(&input[0]); - for (r = 1; r < 8; ++r) { - const int16x8_t input_00 = vld1q_s16(&input[r * stride]); - sum = vaddq_s16(sum, input_00); - } - { - const int32x4_t a = vpaddlq_s16(sum); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); - output[1] = 0; - } -} diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c deleted file mode 100644 index c85b1e910..000000000 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ /dev/null @@ -1,590 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -//------------------------------------------------------------------------------ -// DC 4x4 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_top = vcombine_u16(p1, p1); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_left = vcombine_u16(p1, p1); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } - } -} - -void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, 
stride, above, left, 1, 1); -} - -void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); -} - -void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); -} - -void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 8x8 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); - } - } -} - -void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t 
*above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); -} - -void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); -} - -void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); -} - -void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 16x16 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int 
i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } - } -} - -void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); -} - -void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); -} - -void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); -} - -void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 32x32 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } - - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const 
uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } - } -} - -void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); -} - -void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); -} - -void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - dc_32x32(dst, stride, above, NULL, 1, 0); -} - -void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); -} - -// ----------------------------------------------------------------------------- - -void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); - const uint32x2_t zero = vdup_n_u32(0); - const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = 
vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); -} - -void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint32x2_t d0u32 = vdup_n_u32(0); - (void)left; - - d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); -} - -void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x8_t d0u8 = vdup_n_u8(0); - (void)left; - - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); -} - -void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - (void)left; - - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); -} - -void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - 
(void)left; - - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - } -} - -void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); - (void)above; - - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); -} - -void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); - (void)above; - - d1u64 = vld1_u64((const uint64_t *)left); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); -} - -void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, 
- const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; - - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - dst += stride; - } -} - -void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; - - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = 
vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } - } -} - -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, - const uint16_t *above, - const uint16_t *left) { - assert(bw >= 4); - assert(IS_POWER_OF_TWO(bw)); - int expected_dc, sum = 0; - const int count = bw * 2; - uint32x4_t sum_q = vdupq_n_u32(0); - uint32x2_t sum_d; - uint16_t *dst_1; - if (bw >= 8) { - for (int i = 0; i < bw; i += 8) { - sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); - sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); - above += 8; - left += 8; - } - sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); - sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); - expected_dc = (sum + (count >> 1)) / count; - const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); - for (int r = 0; r < bw; r++) { - dst_1 = dst; - for (int i = 0; i < bw; i += 8) { - vst1q_u16(dst_1, dc); - dst_1 += 8; - } - dst += stride; - } - } else { // 4x4 - sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); - sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); - sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); - expected_dc = (sum + (count >> 1)) / count; - const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); - for (int r = 0; r < bw; r++) { - vst1_u16(dst, dc); - dst += stride; - } - } -} - -#define intra_pred_highbd_sized_neon(type, width) \ - void aom_highbd_##type##_predictor_##width##x##width##_neon( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ - (void)bd; \ - highbd_##type##_predictor(dst, stride, width, above, left); \ - } - -#define intra_pred_square(type) \ - intra_pred_highbd_sized_neon(type, 4); \ - intra_pred_highbd_sized_neon(type, 8); \ - intra_pred_highbd_sized_neon(type, 16); \ - intra_pred_highbd_sized_neon(type, 32); \ - intra_pred_highbd_sized_neon(type, 64); - -intra_pred_square(dc); -#undef intra_pred_square diff --git 
a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c deleted file mode 100644 index bdc67626d..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c +++ /dev/null @@ -1,928 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_dsp_rtcd.h" -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "av1/common/arm/mem_neon.h" -#include "av1/common/arm/transpose_neon.h" - -static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, - uint8x8_t p0q0, const uint8_t blimit, - const uint8_t limit) { - // Calculate mask values for four samples - uint32x2x2_t p0q0_p1q1; - uint16x8_t temp_16x8; - uint16x4_t temp0_16x4, temp1_16x4; - uint8x8_t mask_8x8, temp_8x8; - const uint8x8_t limit_8x8 = vdup_n_u8(limit); - const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); - - mask_8x8 = vabd_u8(p3q3, p2q2); - mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1)); - mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); - mask_8x8 = vcle_u8(mask_8x8, limit_8x8); - - temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); - mask_8x8 = vand_u8(mask_8x8, temp_8x8); - - p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); - temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), - vreinterpret_u8_u32(p0q0_p1q1.val[1])); - temp_16x8 = vmovl_u8(temp_8x8); - temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); - temp1_16x4 = 
vshr_n_u16(vget_high_u16(temp_16x8), 1); - temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); - temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); - temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); - - mask_8x8 = vand_u8(mask_8x8, temp_8x8); - - return mask_8x8; -} - -static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0, - const uint8_t blimit, const uint8_t limit) { - uint32x2x2_t p0q0_p1q1; - uint16x8_t temp_16x8; - uint16x4_t temp0_16x4, temp1_16x4; - const uint16x4_t blimit_16x4 = vdup_n_u16(blimit); - const uint8x8_t limit_8x8 = vdup_n_u8(limit); - uint8x8_t mask_8x8, temp_8x8; - - mask_8x8 = vabd_u8(p1q1, p0q0); - mask_8x8 = vcle_u8(mask_8x8, limit_8x8); - - temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); - mask_8x8 = vand_u8(mask_8x8, temp_8x8); - - p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); - temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), - vreinterpret_u8_u32(p0q0_p1q1.val[1])); - temp_16x8 = vmovl_u8(temp_8x8); - temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); - temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); - temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); - temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); - temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); - - mask_8x8 = vand_u8(mask_8x8, temp_8x8); - - return mask_8x8; -} - -static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, - uint8x8_t p1q1, uint8x8_t p0q0) { - const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 - uint8x8_t flat_8x8, temp_8x8; - - flat_8x8 = vabd_u8(p1q1, p0q0); - flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); - flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0)); - flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); - - temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); - flat_8x8 = vand_u8(flat_8x8, temp_8x8); - - return flat_8x8; -} - -static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1, - 
uint8x8_t p0q0) { - const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 - uint8x8_t flat_8x8, temp_8x8; - - flat_8x8 = vabd_u8(p1q1, p0q0); - flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); - flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); - - temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); - flat_8x8 = vand_u8(flat_8x8, temp_8x8); - - return flat_8x8; -} - -static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, - uint8x8_t p0q0, const uint8_t blimit, - const uint8_t limit) { - // Calculate mask3 values for four samples - uint32x2x2_t p0q0_p1q1; - uint16x8_t temp_16x8; - uint16x4_t temp0_16x4, temp1_16x4; - uint8x8_t mask_8x8, temp_8x8; - const uint8x8_t limit_8x8 = vdup_n_u8(limit); - const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); - - mask_8x8 = vabd_u8(p2q2, p1q1); - mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); - mask_8x8 = vcle_u8(mask_8x8, limit_8x8); - - temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); - mask_8x8 = vand_u8(mask_8x8, temp_8x8); - - p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); - temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), - vreinterpret_u8_u32(p0q0_p1q1.val[1])); - temp_16x8 = vmovl_u8(temp_8x8); - temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); - temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); - temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); - temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); - temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); - - mask_8x8 = vand_u8(mask_8x8, temp_8x8); - - return mask_8x8; -} - -static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, - uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, - uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, - out_f14_pq5; - uint8x8_t out_f7_pq0, out_f7_pq1, 
out_f7_pq2; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8, flat2_8x8; - uint8x8_t q0p0, q1p1, q2p2; - - // Calculate filter masks - mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); - flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = 
vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - hev_8x8 = vmvn_s8(hev_8x8); - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - // reverse p and q - q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); - { - // filter 8 - uint16x8_t out_pq0, out_pq1, out_pq2; - out = vaddl_u8(*p3q3, *p2q2); - out = vaddw_u8(out, *p1q1); - out = vaddw_u8(out, *p0q0); - - out = vaddw_u8(out, q0p0); - out_pq1 = vaddw_u8(out, *p3q3); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - out_pq2 = vaddw_u8(out_pq2, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p1q1); - out_pq1 = vaddw_u8(out_pq1, q1p1); - - out_pq0 = vaddw_u8(out, *p0q0); - out_pq0 = vaddw_u8(out_pq0, q1p1); - out_pq0 = vaddw_u8(out_pq0, q2p2); - - out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); - out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); - } - { - // filter 14 - uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; - uint16x8_t p6q6_2, p6q6_temp, qp_sum; - uint8x8_t qp_rev; - - out = vaddw_u8(out, *p4q4); - out = vaddw_u8(out, *p5q5); - out = vaddw_u8(out, *p6q6); - - out_pq5 = vaddw_u8(out, *p4q4); - out_pq4 = vaddw_u8(out_pq5, *p3q3); - out_pq3 = vaddw_u8(out_pq4, *p2q2); - - out_pq5 = vaddw_u8(out_pq5, *p5q5); - out_pq4 = vaddw_u8(out_pq4, *p5q5); - - out_pq0 = vaddw_u8(out, *p1q1); - out_pq1 = vaddw_u8(out_pq0, *p2q2); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - - out_pq0 = vaddw_u8(out_pq0, *p0q0); - out_pq1 = vaddw_u8(out_pq1, *p0q0); - - out_pq1 = vaddw_u8(out_pq1, *p6q6); - p6q6_2 
= vaddl_u8(*p6q6, *p6q6); - out_pq2 = vaddq_u16(out_pq2, p6q6_2); - p6q6_temp = vaddw_u8(p6q6_2, *p6q6); - out_pq3 = vaddq_u16(out_pq3, p6q6_temp); - p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); - out_pq4 = vaddq_u16(out_pq4, p6q6_temp); - p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); - out_pq5 = vaddq_u16(out_pq5, p6q6_temp); - - out_pq4 = vaddw_u8(out_pq4, q1p1); - - qp_sum = vaddl_u8(q2p2, q1p1); - out_pq3 = vaddq_u16(out_pq3, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq2 = vaddq_u16(out_pq2, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq1 = vaddq_u16(out_pq1, qp_sum); - - qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); - qp_sum = vaddw_u8(qp_sum, qp_rev); - out_pq0 = vaddq_u16(out_pq0, qp_sum); - - out_pq0 = vaddw_u8(out_pq0, q0p0); - - out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); - out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); - out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); - out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); - out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); - out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); - } - { - uint8x8_t filter4_cond, filter8_cond, filter14_cond; - filter8_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter8_cond); - filter14_cond = vand_u8(filter8_cond, flat2_8x8); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter8 outputs - *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); - *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); - *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); - - // filter14 outputs - *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); - *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); - *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); - *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); - *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); - *p5q5 = 
vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); - } -} - -static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, - uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8; - - // Calculate filter masks - mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, 
vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - hev_8x8 = vmvn_s8(hev_8x8); - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - { - // filter 8 - uint16x8_t out_pq0, out_pq1, out_pq2; - uint8x8_t q0p0, q1p1, q2p2; - - out = vaddl_u8(*p3q3, *p2q2); - out = vaddw_u8(out, *p1q1); - out = vaddw_u8(out, *p0q0); - - // reverse p and q - q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); - - out = vaddw_u8(out, q0p0); - out_pq1 = vaddw_u8(out, *p3q3); - out_pq2 = vaddw_u8(out_pq1, *p3q3); - out_pq2 = vaddw_u8(out_pq2, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p1q1); - out_pq1 = vaddw_u8(out_pq1, q1p1); - - out_pq0 = vaddw_u8(out, *p0q0); - out_pq0 = vaddw_u8(out_pq0, q1p1); - out_pq0 = vaddw_u8(out_pq0, q2p2); - - out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); - out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); - } - { - uint8x8_t filter4_cond, filter8_cond; - filter8_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter8_cond); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter8 outputs - *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); - *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); - *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); - } -} - -static void 
lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, - const uint8_t blimit, const uint8_t limit, - const uint8_t thresh) { - uint16x8_t out; - uint8x8_t out_f6_pq0, out_f6_pq1; - uint8x8_t out_f4_pq0, out_f4_pq1; - uint8x8_t mask_8x8, flat_8x8; - - // Calculate filter masks - mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); - flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); - { - // filter 4 - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, 
val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vbic_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); - } - { - // filter 6 - uint16x8_t out_pq0, out_pq1; - uint8x8_t pq_rev; - - out = vaddl_u8(*p0q0, *p1q1); - out = vaddq_u16(out, out); - out = vaddw_u8(out, *p2q2); - - pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); - out = vaddw_u8(out, pq_rev); - - out_pq0 = vaddw_u8(out, pq_rev); - pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); - out_pq0 = vaddw_u8(out_pq0, pq_rev); - - out_pq1 = vaddw_u8(out, *p2q2); - out_pq1 = vaddw_u8(out_pq1, *p2q2); - - out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); - out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); - } - { - uint8x8_t filter4_cond, filter6_cond; - filter6_cond = vand_u8(flat_8x8, mask_8x8); - filter4_cond = vmvn_u8(filter6_cond); - - // filter4 outputs - *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); - *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); - - // filter6 outputs - *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); - *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); - } -} - -static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, - const uint8_t limit, const uint8_t thresh) { - int32x2x2_t ps0_qs0, ps1_qs1; - int16x8_t filter_s16; - const uint8x8_t thresh_f4 = vdup_n_u8(thresh); - uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; - int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; - int8x8_t op0, oq0, op1, oq1; - int8x8_t pq_s0, pq_s1; - int8x8_t filter_s8, filter1_s8, filter2_s8; - int8x8_t hev_8x8; - const int8x8_t sign_mask = vdup_n_s8(0x80); - 
const int8x8_t val_4 = vdup_n_s8(4); - const int8x8_t val_3 = vdup_n_s8(3); - - // Calculate filter mask - mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); - - pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); - pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); - - ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); - ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); - ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); - qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); - ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); - qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); - - // hev_mask - temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); - temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); - hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); - - // add outer taps if we have high edge variance - filter_s8 = vqsub_s8(ps1_s8, qs1_s8); - filter_s8 = vand_s8(filter_s8, hev_8x8); - - // inner taps - temp_s8 = vqsub_s8(qs0_s8, ps0_s8); - filter_s16 = vmovl_s8(filter_s8); - filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); - filter_s8 = vqmovn_s16(filter_s16); - filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); - - filter1_s8 = vqadd_s8(filter_s8, val_4); - filter2_s8 = vqadd_s8(filter_s8, val_3); - filter1_s8 = vshr_n_s8(filter1_s8, 3); - filter2_s8 = vshr_n_s8(filter2_s8, 3); - - oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); - op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); - - filter_s8 = vrshr_n_s8(filter1_s8, 1); - filter_s8 = vbic_s8(filter_s8, hev_8x8); - - oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); - op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); - - *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); - *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); -} - -void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x16_t row0, row1, row2, row3; - uint8x8_t 
pxp3, p6p2, p5p1, p4p0; - uint8x8_t q0q4, q1q5, q2q6, q3qy; - uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3; - uint32x2_t pq_rev; - uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6; - - // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y - // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y - // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y - // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y - load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3); - - pxp3 = vget_low_u8(row0); - p6p2 = vget_low_u8(row1); - p5p1 = vget_low_u8(row2); - p4p0 = vget_low_u8(row3); - transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); - - q0q4 = vget_high_u8(row0); - q1q5 = vget_high_u8(row1); - q2q6 = vget_high_u8(row2); - q3qy = vget_high_u8(row3); - transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy)); - pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5)); - p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4)); - p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6)); - p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev); - - p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); - p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); - p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); - p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]); - p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); - p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); - p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); - - lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, - *thresh); - - pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3)); - p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1)); - p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0)); - p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), 
vreinterpret_u32_u8(p2q2)); - - pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]); - p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]); - p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]); - p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]); - - q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); - q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); - q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); - q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]); - transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); - - pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]); - p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); - p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); - p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); - transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); - - row0 = vcombine_u8(pxp3, q0q4); - row1 = vcombine_u8(p6p2, q1q5); - row2 = vcombine_u8(p5p1, q2q6); - row3 = vcombine_u8(p4p0, q3qy); - - store_u8_8x16(src - 8, stride, row0, row1, row2, row3); -} - -void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint32x2x2_t p2q2_p1q1, p3q3_p0q0; - uint32x2_t pq_rev; - uint8x8_t p3q0, p2q1, p1q2, p0q3; - uint8x8_t p0q0, p1q1, p2q2, p3q3; - - // row0: p3 p2 p1 p0 | q0 q1 q2 q3 - // row1: p3 p2 p1 p0 | q0 q1 q2 q3 - // row2: p3 p2 p1 p0 | q0 q1 q2 q3 - // row3: p3 p2 p1 p0 | q0 q1 q2 q3 - load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3); - - transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3)); - p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); - p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); - - p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); - p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); - p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); - p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); - - lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); - - pq_rev = 
vrev64_u32(vreinterpret_u32_u8(p0q0)); - p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); - p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); - - p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); - p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); - p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); - p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); - transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); - - store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); -} - -void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint32x2x2_t p2q2_p1q1, pxqy_p0q0; - uint32x2_t pq_rev; - uint8x8_t pxq0, p2q1, p1q2, p0qy; - uint8x8_t p0q0, p1q1, p2q2, pxqy; - - // row0: px p2 p1 p0 | q0 q1 q2 qy - // row1: px p2 p1 p0 | q0 q1 q2 qy - // row2: px p2 p1 p0 | q0 q1 q2 qy - // row3: px p2 p1 p0 | q0 q1 q2 qy - load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy); - - transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy)); - pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); - p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); - - p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); - p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); - p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); - pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]); - - lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); - pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev); - - pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); - p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); - - p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); - p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); - p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); - pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]); 
- transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); - - store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy); -} - -void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0; - uint32x2_t pq_rev; - uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1; - - // row0: p1 p0 | q0 q1 - // row1: p1 p0 | q0 q1 - // row2: p1 p0 | q0 q1 - // row3: p1 p0 | q0 q1 - load_u8_4x1(src - 2, &p1p0, 0); - load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1); - load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0); - load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1); - - transpose_u8_4x4(&p1p0, &q0q1); - - p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1)); - - pq_rev = vrev64_u32(p1q0_p0q1.val[1]); - p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev); - - p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]); - p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]); - - lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); - - p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0)); - - p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]); - q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1])); - - transpose_u8_4x4(&p1p0, &q0q1); - - store_u8_4x1(src - 2, p1p0, 0); - store_u8_4x1((src - 2) + 1 * stride, q0q1, 0); - store_u8_4x1((src - 2) + 2 * stride, p1p0, 1); - store_u8_4x1((src - 2) + 3 * stride, q0q1, 1); -} - -void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6); - - load_u8_4x1(src - 7 * stride, &p6q6, 0); - load_u8_4x1(src - 6 * stride, &p5q5, 0); - load_u8_4x1(src - 5 * stride, &p4q4, 0); - load_u8_4x1(src - 4 * stride, &p3q3, 0); - load_u8_4x1(src - 3 * stride, &p2q2, 0); - load_u8_4x1(src - 2 * stride, &p1q1, 0); - load_u8_4x1(src - 1 * stride, &p0q0, 0); - load_u8_4x1(src + 0 * stride, &p0q0, 1); - load_u8_4x1(src 
+ 1 * stride, &p1q1, 1); - load_u8_4x1(src + 2 * stride, &p2q2, 1); - load_u8_4x1(src + 3 * stride, &p3q3, 1); - load_u8_4x1(src + 4 * stride, &p4q4, 1); - load_u8_4x1(src + 5 * stride, &p5q5, 1); - load_u8_4x1(src + 6 * stride, &p6q6, 1); - - lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, - *thresh); - - store_u8_4x1(src - 6 * stride, p5q5, 0); - store_u8_4x1(src - 5 * stride, p4q4, 0); - store_u8_4x1(src - 4 * stride, p3q3, 0); - store_u8_4x1(src - 3 * stride, p2q2, 0); - store_u8_4x1(src - 2 * stride, p1q1, 0); - store_u8_4x1(src - 1 * stride, p0q0, 0); - store_u8_4x1(src + 0 * stride, p0q0, 1); - store_u8_4x1(src + 1 * stride, p1q1, 1); - store_u8_4x1(src + 2 * stride, p2q2, 1); - store_u8_4x1(src + 3 * stride, p3q3, 1); - store_u8_4x1(src + 4 * stride, p4q4, 1); - store_u8_4x1(src + 5 * stride, p5q5, 1); -} - -void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t p0q0, p1q1, p2q2, p3q3; - - p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride))); - p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); - p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); - p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); - p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), - vreinterpret_u32_u8(p0q0), 1)); - p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), - vreinterpret_u32_u8(p1q1), 1)); - p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), - vreinterpret_u32_u8(p2q2), 1)); - p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride), - vreinterpret_u32_u8(p3q3), 1)); - - lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); - - vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0); - vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); - 
vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); - vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); - vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); - vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); - vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); - vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1); -} - -void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t p0q0, p1q1, p2q2; - - p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); - p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); - p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); - p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), - vreinterpret_u32_u8(p0q0), 1)); - p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), - vreinterpret_u32_u8(p1q1), 1)); - p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), - vreinterpret_u32_u8(p2q2), 1)); - - lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); - - vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); - vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); - vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); - vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); - vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); - vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); -} - -void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1); - - load_u8_4x1(src - 2 * stride, &p1q1, 0); - load_u8_4x1(src - 1 * stride, &p0q0, 
0); - load_u8_4x1(src + 0 * stride, &p0q0, 1); - load_u8_4x1(src + 1 * stride, &p1q1, 1); - - lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); - - store_u8_4x1(src - 2 * stride, p1q1, 0); - store_u8_4x1(src - 1 * stride, p0q0, 0); - store_u8_4x1(src + 0 * stride, p0q0, 1); - store_u8_4x1(src + 1 * stride, p1q1, 1); -} diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c deleted file mode 100644 index 606950ab2..000000000 --- a/third_party/aom/aom_dsp/arm/sad4d_neon.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, -// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo -// and vec_sum_ref_hi. 
-static void sad_neon_64(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); -} - -// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, -// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. 
-static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); -} - -void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); - - src += src_stride; - ref0 += 
ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); -} - -void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - - sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, - &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, - &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, - &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, - &vec_sum_ref3_hi); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); -} - -void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - 
int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref0 = vld1q_u8(ref0); - const uint8x16_t vec_ref1 = vld1q_u8(ref1); - const uint8x16_t vec_ref2 = vld1q_u8(ref2); - const uint8x16_t vec_ref3 = vld1q_u8(ref3); - - vec_sum_ref0_lo = - vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); - vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = - vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); - vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = - vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); - vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = - vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); - vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref3)); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); -} diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c 
b/third_party/aom/aom_dsp/arm/sad_neon.c deleted file mode 100644 index a39de91d6..000000000 --- a/third_party/aom/aom_dsp/arm/sad_neon.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} - -unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; - int i; - - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - d1 = 
vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); - - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); -} - -unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; - - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); - } - - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); -} - -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} -static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = 
vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); - } - return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); -} - -unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - 
vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); -} - -unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref = vld1q_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum_lo = - vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); - vec_accum_hi = - vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); -} - -unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum = vdupq_n_u16(0); - - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); - } - return horizontal_add_16x8(vec_accum); -} diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c deleted file mode 100644 index cf618eee7..000000000 --- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_dsp_rtcd.h" -#include "config/aom_config.h" - -#include "aom_ports/mem.h" -#include "aom/aom_integer.h" - -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/variance.h" - -static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(&output_ptr[0], out); - // Next row... 
- src_ptr += src_pixels_per_line; - output_ptr += output_width; - } -} - -static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); - unsigned int i, j; - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 16) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); - } - // Next row... 
- src_ptr += src_pixels_per_line; - output_ptr += output_width; - } -} - -unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); - DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); - - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - bilinear_filters_2t[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, - bilinear_filters_2t[yoffset]); - return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse); -} - -unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); - DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - bilinear_filters_2t[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, - bilinear_filters_2t[yoffset]); - return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse); -} - -unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); - DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - bilinear_filters_2t[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, - bilinear_filters_2t[yoffset]); - return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse); -} - -unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - 
bilinear_filters_2t[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, - bilinear_filters_2t[yoffset]); - return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse); -} diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c deleted file mode 100644 index 28f5ace8e..000000000 --- a/third_party/aom/aom_dsp/arm/subtract_neon.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -void aom_subtract_block_neon(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src, - ptrdiff_t src_stride, const uint8_t *pred, - ptrdiff_t pred_stride) { - int r, c; - - if (cols > 16) { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = - vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = - vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = - vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = - vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); 
- vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); - } - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } -} diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c deleted file mode 100644 index 74385a601..000000000 --- a/third_party/aom/aom_dsp/arm/variance_neon.c +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <arm_neon.h> - -#include "config/aom_dsp_rtcd.h" -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -// w * h must be less than 2048 or local variable v_sum may overflow. -static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { - int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = - vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); - v_sse_hi = - vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); - } - a += a_stride; - b += b_stride; - } - - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); -} - -void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, 
b_stride, 8, 8, sse, sum); -} - -void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); -} - -unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - ((sum * sum) >> 6); -} - -unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8); -} - -unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} - -unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, - 32, 32, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} - -unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} - -unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - 
unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); -} - -unsigned int aom_variance16x8_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - 
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int aom_variance8x16_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = 
vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; 
- q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} - -unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d5u8 = 
vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); - - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c deleted file mode 100644 index 01088010a..000000000 --- a/third_party/aom/aom_dsp/binary_codes_reader.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/binary_codes_reader.h" - -#include "av1/common/common.h" - -// Inverse recenters a non-negative literal v around a reference r -static uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) { - if (v > (r << 1)) - return v; - else if ((v & 1) == 0) - return (v >> 1) + r; - else - return r - ((v + 1) >> 1); -} - -// Inverse recenters a non-negative literal v in [0, n-1] around a -// reference r also in [0, n-1] -static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { - if ((r << 1) <= n) { - return inv_recenter_nonneg(r, v); - } else { - return n - 1 - inv_recenter_nonneg(n - 1 - r, v); - } -} - -uint16_t aom_read_primitive_quniform_(aom_reader *r, - uint16_t n ACCT_STR_PARAM) { - if (n <= 1) return 0; - const int l = get_msb(n) + 1; - const int m = (1 << l) - n; - const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); - return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); -} - -static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, - uint16_t n) { - if (n <= 1) return 0; - const int l = get_msb(n) + 1; - const int m = (1 << l) - n; - const int v = aom_rb_read_literal(rb, l - 1); - return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); -} - -// Decode finite subexponential code that for a symbol v in [0, n-1] with -// parameter k -uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, - uint16_t k ACCT_STR_PARAM) { - int i = 0; - int mk = 0; - - while (1) { - int b = (i ? k + i - 1 : k); - int a = (1 << b); - - if (n <= mk + 3 * a) { - return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; - } - - if (!aom_read_bit(r, ACCT_STR_NAME)) { - return aom_read_literal(r, b, ACCT_STR_NAME) + mk; - } - - i = i + 1; - mk += a; - } - - assert(0); - return 0; -} - -static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb, - uint16_t n, uint16_t k) { - int i = 0; - int mk = 0; - - while (1) { - int b = (i ? 
k + i - 1 : k); - int a = (1 << b); - - if (n <= mk + 3 * a) { - return aom_rb_read_primitive_quniform(rb, n - mk) + mk; - } - - if (!aom_rb_read_bit(rb)) { - return aom_rb_read_literal(rb, b) + mk; - } - - i = i + 1; - mk += a; - } - - assert(0); - return 0; -} - -uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, - uint16_t ref ACCT_STR_PARAM) { - return inv_recenter_finite_nonneg( - n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); -} - -static uint16_t aom_rb_read_primitive_refsubexpfin( - struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) { - return inv_recenter_finite_nonneg(n, ref, - aom_rb_read_primitive_subexpfin(rb, n, k)); -} - -int16_t aom_rb_read_signed_primitive_refsubexpfin( - struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) { - ref += n - 1; - const uint16_t scaled_n = (n << 1) - 1; - return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; -} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h deleted file mode 100644 index 364a67469..000000000 --- a/third_party/aom/aom_dsp/binary_codes_reader.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_ -#define AOM_AOM_DSP_BINARY_CODES_READER_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/bitreader.h" -#include "aom_dsp/bitreader_buffer.h" - -#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \ - aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \ - aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ - aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME)) - -uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM); -uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, - uint16_t k ACCT_STR_PARAM); -uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, - uint16_t ref ACCT_STR_PARAM); - -int16_t aom_rb_read_signed_primitive_refsubexpfin( - struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_ diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c deleted file mode 100644 index ee7a9f567..000000000 --- a/third_party/aom/aom_dsp/binary_codes_writer.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/bitwriter.h" -#include "aom_dsp/binary_codes_writer.h" - -#include "av1/common/common.h" - -// Recenters a non-negative literal v around a reference r -static uint16_t recenter_nonneg(uint16_t r, uint16_t v) { - if (v > (r << 1)) - return v; - else if (v >= r) - return ((v - r) << 1); - else - return ((r - v) << 1) - 1; -} - -// Recenters a non-negative literal v in [0, n-1] around a -// reference r also in [0, n-1] -static uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { - if ((r << 1) <= n) { - return recenter_nonneg(r, v); - } else { - return recenter_nonneg(n - 1 - r, n - 1 - v); - } -} - -// Codes a symbol v in [-2^mag_bits, 2^mag_bits]. -// mag_bits is number of bits for magnitude. The alphabet is of size -// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to -// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide -// and 1 more bit for the sign if non-zero. -void aom_write_primitive_symmetric(aom_writer *w, int16_t v, - unsigned int abs_bits) { - if (v == 0) { - aom_write_bit(w, 0); - } else { - const int x = abs(v); - const int s = v < 0; - aom_write_bit(w, 1); - aom_write_bit(w, s); - aom_write_literal(w, x - 1, abs_bits); - } -} - -int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) { - return (v == 0 ? 
1 : abs_bits + 2); -} - -// Encodes a value v in [0, n-1] quasi-uniformly -void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { - if (n <= 1) return; - const int l = get_msb(n) + 1; - const int m = (1 << l) - n; - if (v < m) { - aom_write_literal(w, v, l - 1); - } else { - aom_write_literal(w, m + ((v - m) >> 1), l - 1); - aom_write_bit(w, (v - m) & 1); - } -} - -static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, - uint16_t n, uint16_t v) { - if (n <= 1) return; - const int l = get_msb(n) + 1; - const int m = (1 << l) - n; - if (v < m) { - aom_wb_write_literal(wb, v, l - 1); - } else { - aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); - aom_wb_write_bit(wb, (v - m) & 1); - } -} - -int aom_count_primitive_quniform(uint16_t n, uint16_t v) { - if (n <= 1) return 0; - const int l = get_msb(n) + 1; - const int m = (1 << l) - n; - return v < m ? l - 1 : l; -} - -// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k -void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, - uint16_t v) { - int i = 0; - int mk = 0; - while (1) { - int b = (i ? k + i - 1 : k); - int a = (1 << b); - if (n <= mk + 3 * a) { - aom_write_primitive_quniform(w, n - mk, v - mk); - break; - } else { - int t = (v >= mk + a); - aom_write_bit(w, t); - if (t) { - i = i + 1; - mk += a; - } else { - aom_write_literal(w, v - mk, b); - break; - } - } - } -} - -static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb, - uint16_t n, uint16_t k, - uint16_t v) { - int i = 0; - int mk = 0; - while (1) { - int b = (i ? 
k + i - 1 : k); - int a = (1 << b); - if (n <= mk + 3 * a) { - aom_wb_write_primitive_quniform(wb, n - mk, v - mk); - break; - } else { - int t = (v >= mk + a); - aom_wb_write_bit(wb, t); - if (t) { - i = i + 1; - mk += a; - } else { - aom_wb_write_literal(wb, v - mk, b); - break; - } - } - } -} - -int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { - int count = 0; - int i = 0; - int mk = 0; - while (1) { - int b = (i ? k + i - 1 : k); - int a = (1 << b); - if (n <= mk + 3 * a) { - count += aom_count_primitive_quniform(n - mk, v - mk); - break; - } else { - int t = (v >= mk + a); - count++; - if (t) { - i = i + 1; - mk += a; - } else { - count += b; - break; - } - } - } - return count; -} - -// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k -// based on a reference ref also in [0, n-1]. -// Recenters symbol around r first and then uses a finite subexponential code. -void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, - uint16_t ref, uint16_t v) { - aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v)); -} - -static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, - uint16_t n, uint16_t k, - uint16_t ref, uint16_t v) { - aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v)); -} - -void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, - uint16_t k, int16_t ref, - int16_t v) { - ref += n - 1; - v += n - 1; - const uint16_t scaled_n = (n << 1) - 1; - aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v); -} - -void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, - uint16_t n, uint16_t k, - int16_t ref, int16_t v) { - ref += n - 1; - v += n - 1; - const uint16_t scaled_n = (n << 1) - 1; - aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v); -} - -int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, - uint16_t v) { - return 
aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); -} - -int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, - int16_t v) { - ref += n - 1; - v += n - 1; - const uint16_t scaled_n = (n << 1) - 1; - return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v); -} diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h deleted file mode 100644 index c360e0e29..000000000 --- a/third_party/aom/aom_dsp/binary_codes_writer.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_ -#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include <assert.h> -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/bitwriter.h" -#include "aom_dsp/bitwriter_buffer.h" - -// Codes a symbol v in [-2^mag_bits, 2^mag_bits] -// mag_bits is number of bits for magnitude. The alphabet is of size -// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to -// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide -// and 1 more bit for the sign if non-zero. 
-void aom_write_primitive_symmetric(aom_writer *w, int16_t v, - unsigned int mag_bits); - -// Encodes a value v in [0, n-1] quasi-uniformly -void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v); - -// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k -void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, - uint16_t v); - -// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k -// based on a reference ref also in [0, n-1]. -void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, - uint16_t ref, uint16_t v); - -// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with -// parameter k based on a reference ref also in [-(n-1), n-1]. -void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, - uint16_t k, int16_t ref, - int16_t v); - -void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, - uint16_t n, uint16_t k, - int16_t ref, int16_t v); - -// Functions that counts bits for the above primitives -int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); -int aom_count_primitive_quniform(uint16_t n, uint16_t v); -int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v); -int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, - uint16_t v); -int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, - int16_t v); -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_ diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h deleted file mode 100644 index 7c0efcc78..000000000 --- a/third_party/aom/aom_dsp/bitreader.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_BITREADER_H_ -#define AOM_AOM_DSP_BITREADER_H_ - -#include <assert.h> -#include <limits.h> - -#include "config/aom_config.h" - -#include "aom/aomdx.h" -#include "aom/aom_integer.h" -#include "aom_dsp/daalaboolreader.h" -#include "aom_dsp/prob.h" -#include "av1/common/odintrin.h" - -#if CONFIG_ACCOUNTING -#include "av1/decoder/accounting.h" -#define ACCT_STR_NAME acct_str -#define ACCT_STR_PARAM , const char *ACCT_STR_NAME -#define ACCT_STR_ARG(s) , s -#else -#define ACCT_STR_PARAM -#define ACCT_STR_ARG(s) -#endif - -#define aom_read(r, prob, ACCT_STR_NAME) \ - aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_bit(r, ACCT_STR_NAME) \ - aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \ - aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_literal(r, bits, ACCT_STR_NAME) \ - aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \ - aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ - aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct daala_reader aom_reader; - -static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer, - size_t size) { - return aom_daala_reader_init(r, buffer, (int)size); -} - -static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) { - return aom_daala_reader_find_begin(r); -} - -static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) { - return aom_daala_reader_find_end(r); -} - -static INLINE 
int aom_reader_has_error(aom_reader *r) { - return aom_daala_reader_has_error(r); -} - -// Returns true if the bit reader has tried to decode more data from the buffer -// than was actually provided. -static INLINE int aom_reader_has_overflowed(const aom_reader *r) { - return aom_daala_reader_has_overflowed(r); -} - -// Returns the position in the bit reader in bits. -static INLINE uint32_t aom_reader_tell(const aom_reader *r) { - return aom_daala_reader_tell(r); -} - -// Returns the position in the bit reader in 1/8th bits. -static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) { - return aom_daala_reader_tell_frac(r); -} - -#if CONFIG_ACCOUNTING -static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { - if (r->accounting != NULL) { - uint32_t tell_frac; - tell_frac = aom_reader_tell_frac(r); - aom_accounting_record(r->accounting, ACCT_STR_NAME, - tell_frac - r->accounting->last_tell_frac); - r->accounting->last_tell_frac = tell_frac; - } -} - -static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { - if (r->accounting != NULL) { - r->accounting->syms.num_multi_syms += !is_binary; - r->accounting->syms.num_binary_syms += !!is_binary; - } -} -#endif - -static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { - int ret; - ret = aom_daala_read(r, prob); -#if CONFIG_ACCOUNTING - if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); - aom_update_symb_counts(r, 1); -#endif - return ret; -} - -static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { - int ret; - ret = aom_read(r, 128, NULL); // aom_prob_half -#if CONFIG_ACCOUNTING - if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); -#endif - return ret; -} - -static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { - int literal = 0, bit; - - for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; -#if CONFIG_ACCOUNTING - if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); 
-#endif - return literal; -} - -static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, - int nsymbs ACCT_STR_PARAM) { - int ret; - ret = daala_read_symbol(r, cdf, nsymbs); - -#if CONFIG_ACCOUNTING - if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); - aom_update_symb_counts(r, (nsymbs == 2)); -#endif - return ret; -} - -static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, - int nsymbs ACCT_STR_PARAM) { - int ret; - ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); - if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs); - return ret; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_BITREADER_H_ diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c deleted file mode 100644 index b53211784..000000000 --- a/third_party/aom/aom_dsp/bitreader_buffer.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom_dsp/bitreader_buffer.h" - -size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) { - return (rb->bit_offset + 7) >> 3; -} - -int aom_rb_read_bit(struct aom_read_bit_buffer *rb) { - const uint32_t off = rb->bit_offset; - const uint32_t p = off >> 3; - const int q = 7 - (int)(off & 0x7); - if (rb->bit_buffer + p < rb->bit_buffer_end) { - const int bit = (rb->bit_buffer[p] >> q) & 1; - rb->bit_offset = off + 1; - return bit; - } else { - if (rb->error_handler) rb->error_handler(rb->error_handler_data); - return 0; - } -} - -int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { - assert(bits <= 31); - int value = 0, bit; - for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit; - return value; -} - -uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, - int bits) { - assert(bits <= 32); - uint32_t value = 0; - int bit; - for (bit = bits - 1; bit >= 0; bit--) - value |= (uint32_t)aom_rb_read_bit(rb) << bit; - return value; -} - -int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { - const int nbits = sizeof(unsigned) * 8 - bits - 1; - const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits; - return ((int)value) >> nbits; -} - -uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { - int leading_zeros = 0; - while (!aom_rb_read_bit(rb)) ++leading_zeros; - // Maximum 32 bits. - if (leading_zeros >= 32) return UINT32_MAX; - const uint32_t base = (1u << leading_zeros) - 1; - const uint32_t value = aom_rb_read_literal(rb, leading_zeros); - return base + value; -} diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h deleted file mode 100644 index 725ca1ea2..000000000 --- a/third_party/aom/aom_dsp/bitreader_buffer.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_ -#define AOM_AOM_DSP_BITREADER_BUFFER_H_ - -#include <limits.h> - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void (*aom_rb_error_handler)(void *data); - -struct aom_read_bit_buffer { - const uint8_t *bit_buffer; - const uint8_t *bit_buffer_end; - uint32_t bit_offset; - - void *error_handler_data; - aom_rb_error_handler error_handler; -}; - -size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb); - -int aom_rb_read_bit(struct aom_read_bit_buffer *rb); - -int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits); - -uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits); - -int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits); - -uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h deleted file mode 100644 index b5ecc2382..000000000 --- a/third_party/aom/aom_dsp/bitwriter.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_BITWRITER_H_ -#define AOM_AOM_DSP_BITWRITER_H_ - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom_dsp/daalaboolwriter.h" -#include "aom_dsp/prob.h" - -#if CONFIG_RD_DEBUG -#include "av1/common/blockd.h" -#include "av1/encoder/cost.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct daala_writer aom_writer; - -typedef struct TOKEN_STATS { - int cost; -#if CONFIG_RD_DEBUG - int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE]; -#endif -} TOKEN_STATS; - -static INLINE void init_token_stats(TOKEN_STATS *token_stats) { -#if CONFIG_RD_DEBUG - int r, c; - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { - token_stats->txb_coeff_cost_map[r][c] = 0; - } - } -#endif - token_stats->cost = 0; -} - -static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { - aom_daala_start_encode(bc, buffer); -} - -static INLINE int aom_stop_encode(aom_writer *bc) { - return aom_daala_stop_encode(bc); -} - -static INLINE void aom_write(aom_writer *br, int bit, int probability) { - aom_daala_write(br, bit, probability); -} - -static INLINE void aom_write_bit(aom_writer *w, int bit) { - aom_write(w, bit, 128); // aom_prob_half -} - -static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { - int bit; - - for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit)); -} - -static INLINE void aom_write_cdf(aom_writer *w, int symb, - const aom_cdf_prob *cdf, int nsymbs) { - daala_write_symbol(w, symb, cdf, nsymbs); -} - -static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, - int nsymbs) { - aom_write_cdf(w, symb, cdf, nsymbs); - if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs); -} - -#ifdef __cplusplus -} // extern 
"C" -#endif - -#endif // AOM_AOM_DSP_BITWRITER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c deleted file mode 100644 index 596246deb..000000000 --- a/third_party/aom/aom_dsp/bitwriter_buffer.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <limits.h> -#include <stdlib.h> - -#include "config/aom_config.h" - -#include "aom_dsp/bitwriter_buffer.h" - -int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) { - return (wb->bit_offset % CHAR_BIT == 0); -} - -uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) { - return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); -} - -void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) { - const int off = (int)wb->bit_offset; - const int p = off / CHAR_BIT; - const int q = CHAR_BIT - 1 - off % CHAR_BIT; - if (q == CHAR_BIT - 1) { - // Zero next char and write bit - wb->bit_buffer[p] = bit << q; - } else { - wb->bit_buffer[p] &= ~(1 << q); - wb->bit_buffer[p] |= bit << q; - } - wb->bit_offset = off + 1; -} - -void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) { - // Do not zero bytes but overwrite exisiting values - const int off = (int)wb->bit_offset; - const int p = off / CHAR_BIT; - const int q = CHAR_BIT - 1 - off % CHAR_BIT; - wb->bit_buffer[p] &= ~(1 << q); - wb->bit_buffer[p] |= bit << q; - wb->bit_offset = off + 1; -} - -void aom_wb_write_literal(struct 
aom_write_bit_buffer *wb, int data, int bits) { - assert(bits <= 31); - int bit; - for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); -} - -void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, - uint32_t data, int bits) { - assert(bits <= 32); - int bit; - for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); -} - -void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, - int bits) { - int bit; - for (bit = bits - 1; bit >= 0; bit--) - aom_wb_overwrite_bit(wb, (data >> bit) & 1); -} - -void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, - int bits) { - aom_wb_write_literal(wb, data, bits + 1); -} - -void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { - int64_t shift_val = ++v; - int leading_zeroes = 1; - - assert(shift_val > 0); - - while (shift_val >>= 1) leading_zeroes += 2; - - aom_wb_write_literal(wb, 0, leading_zeroes >> 1); - aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); -} diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h deleted file mode 100644 index d0311284f..000000000 --- a/third_party/aom/aom_dsp/bitwriter_buffer.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_ -#define AOM_AOM_DSP_BITWRITER_BUFFER_H_ - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct aom_write_bit_buffer { - uint8_t *bit_buffer; - uint32_t bit_offset; -}; - -int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb); - -uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); - -void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); - -void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit); - -void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); - -void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, - uint32_t data, int bits); - -void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, - int bits); - -void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, - int bits); - -void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h deleted file mode 100644 index fd87dc181..000000000 --- a/third_party/aom/aom_dsp/blend.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_BLEND_H_ -#define AOM_AOM_DSP_BLEND_H_ - -#include "aom_ports/mem.h" - -// Various blending functions and macros. 
-// See also the aom_blend_* functions in aom_dsp_rtcd.h - -// Alpha blending with alpha values from the range [0, 64], where 64 -// means use the first input and 0 means use the second input. - -#define AOM_BLEND_A64_ROUND_BITS 6 -#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64 - -#define AOM_BLEND_A64(a, v0, v1) \ - ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ - AOM_BLEND_A64_ROUND_BITS) - -// Alpha blending with alpha values from the range [0, 256], where 256 -// means use the first input and 0 means use the second input. -#define AOM_BLEND_A256_ROUND_BITS 8 -#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256 - -#define AOM_BLEND_A256(a, v0, v1) \ - ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ - AOM_BLEND_A256_ROUND_BITS) - -// Blending by averaging. -#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) - -#define DIFF_FACTOR_LOG2 4 -#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) - -#endif // AOM_AOM_DSP_BLEND_H_ diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c deleted file mode 100644 index 0554b43d1..000000000 --- a/third_party/aom/aom_dsp/blend_a64_hmask.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/blend.h" - -#include "config/aom_dsp_rtcd.h" - -void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - int i, j; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = AOM_BLEND_A64( - mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); - } - } -} - -void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int w, int h, int bd) { - int i, j; - uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); - const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); - const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); - (void)bd; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - assert(bd == 8 || bd == 10 || bd == 12); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = AOM_BLEND_A64( - mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); - } - } -} diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c deleted file mode 100644 index 992cc5c0c..000000000 --- a/third_party/aom/aom_dsp/blend_a64_mask.c +++ /dev/null @@ -1,345 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/blend.h" -#include "aom_dsp/aom_dsp_common.h" - -#include "config/aom_dsp_rtcd.h" - -// Blending with alpha mask. Mask values come from the range [0, 64], -// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can -// be the same as dst, or dst can be different from both sources. - -// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are -// in a higher intermediate precision, and will later be rounded down to pixel -// precision. -// Thus, in order to avoid double-rounding, we want to use normal right shifts -// within this function, not ROUND_POWER_OF_TWO. -// This works because of the identity: -// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z) -// -// In contrast, the output of the non-d32 functions will not be further rounded, -// so we *should* use ROUND_POWER_OF_TWO there. 
- -void aom_lowbd_blend_a64_d16_mask_c( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, - ConvolveParams *conv_params) { - int i, j; - const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - - assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); - - assert(h >= 4); - assert(w >= 4); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - int32_t res; - const int m = mask[i * mask_stride + j]; - res = ((m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> - AOM_BLEND_A64_ROUND_BITS); - res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); - } - } - } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - int32_t res; - const int m = ROUND_POWER_OF_TWO( - mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - res = ((m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> - AOM_BLEND_A64_ROUND_BITS); - res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); - } - } - } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - int32_t res; - const int m = AOM_BLEND_AVG(mask[i 
* mask_stride + (2 * j)], - mask[i * mask_stride + (2 * j + 1)]); - res = ((m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> - AOM_BLEND_A64_ROUND_BITS); - res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); - } - } - } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - int32_t res; - const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], - mask[(2 * i + 1) * mask_stride + j]); - res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] + - (AOM_BLEND_A64_MAX_ALPHA - m) * - (int32_t)src1[i * src1_stride + j]) >> - AOM_BLEND_A64_ROUND_BITS); - res -= round_offset; - dst[i * dst_stride + j] = - clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); - } - } - } -} - -void aom_highbd_blend_a64_d16_mask_c( - uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, - ConvolveParams *conv_params, const int bd) { - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - // excerpt from clip_pixel_highbd() - // set saturation_value to (1 << bd) - 1 - unsigned int saturation_value; - switch (bd) { - case 8: - default: saturation_value = 255; break; - case 10: saturation_value = 1023; break; - case 12: saturation_value = 4095; break; - } - - if (subw == 0 && subh == 0) { - for (int i = 0; i < h; ++i) { - for 
(int j = 0; j < w; ++j) { - int32_t res; - const int m = mask[j]; - res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> - AOM_BLEND_A64_ROUND_BITS); - res -= round_offset; - unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); - dst[j] = AOMMIN(v, saturation_value); - } - mask += mask_stride; - src0 += src0_stride; - src1 += src1_stride; - dst += dst_stride; - } - } else if (subw == 1 && subh == 1) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int32_t res; - const int m = ROUND_POWER_OF_TWO( - mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] + - mask[mask_stride + 2 * j + 1], - 2); - res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> - AOM_BLEND_A64_ROUND_BITS; - res -= round_offset; - unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); - dst[j] = AOMMIN(v, saturation_value); - } - mask += 2 * mask_stride; - src0 += src0_stride; - src1 += src1_stride; - dst += dst_stride; - } - } else if (subw == 1 && subh == 0) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int32_t res; - const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]); - res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> - AOM_BLEND_A64_ROUND_BITS; - res -= round_offset; - unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); - dst[j] = AOMMIN(v, saturation_value); - } - mask += mask_stride; - src0 += src0_stride; - src1 += src1_stride; - dst += dst_stride; - } - } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int32_t res; - const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]); - res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> - AOM_BLEND_A64_ROUND_BITS; - res -= round_offset; - unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); - dst[j] = AOMMIN(v, saturation_value); - } - mask += 2 * mask_stride; - src0 += src0_stride; - src1 += src1_stride; - dst += dst_stride; - } - } -} - -// 
Blending with alpha mask. Mask values come from the range [0, 64], -// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can -// be the same as dst, or dst can be different from both sources. - -void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subw, int subh) { - int i, j; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = mask[i * mask_stride + j]; - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = ROUND_POWER_OF_TWO( - mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], - mask[i * mask_stride + (2 * j + 1)]); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], - mask[(2 * i + 1) * mask_stride + j]); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } -} - -void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, 
uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int w, int h, int subw, int subh, int bd) { - int i, j; - uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); - const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); - const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); - (void)bd; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - assert(bd == 8 || bd == 10 || bd == 12); - - if (subw == 0 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = mask[i * mask_stride + j]; - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } else if (subw == 1 && subh == 1) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = ROUND_POWER_OF_TWO( - mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } else if (subw == 1 && subh == 0) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], - mask[i * mask_stride + (2 * j + 1)]); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], - mask[(2 * i + 1) * mask_stride + j]); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } - } -} diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c 
b/third_party/aom/aom_dsp/blend_a64_vmask.c deleted file mode 100644 index 4f222e17f..000000000 --- a/third_party/aom/aom_dsp/blend_a64_vmask.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/blend.h" - -#include "config/aom_dsp_rtcd.h" - -void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - int i, j; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - for (i = 0; i < h; ++i) { - const int m = mask[i]; - for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } -} - -void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int w, int h, int bd) { - int i, j; - uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); - const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); - const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); - (void)bd; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == 
dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - assert(bd == 8 || bd == 10 || bd == 12); - - for (i = 0; i < h; ++i) { - const int m = mask[i]; - for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); - } - } -} diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c deleted file mode 100644 index f7703dffc..000000000 --- a/third_party/aom/aom_dsp/buf_ans.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <string.h> - -#include "aom_dsp/buf_ans.h" -#include "aom_mem/aom_mem.h" -#include "aom/internal/aom_codec_internal.h" - -void aom_buf_ans_alloc(struct BufAnsCoder *c, - struct aom_internal_error_info *error) { - c->error = error; - assert(c->size > 1); - AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf))); - // Initialize to overfull to trigger the assert in write. 
- c->offset = c->size + 1; -} - -void aom_buf_ans_free(struct BufAnsCoder *c) { - aom_free(c->buf); - c->buf = NULL; - c->size = 0; -} - -#if !ANS_MAX_SYMBOLS -void aom_buf_ans_grow(struct BufAnsCoder *c) { - struct buffered_ans_symbol *new_buf = NULL; - int new_size = c->size * 2; - AOM_CHECK_MEM_ERROR(c->error, new_buf, - aom_malloc(new_size * sizeof(*new_buf))); - memcpy(new_buf, c->buf, c->size * sizeof(*c->buf)); - aom_free(c->buf); - c->buf = new_buf; - c->size = new_size; -} -#endif - -void aom_buf_ans_flush(struct BufAnsCoder *const c) { - int offset; -#if ANS_MAX_SYMBOLS - if (c->offset == 0) return; -#endif - assert(c->offset > 0); - offset = c->offset - 1; - // Code the first symbol such that it brings the state to the smallest normal - // state from an initial state that would have been a subnormal/refill state. - if (c->buf[offset].method == ANS_METHOD_RANS) { - c->ans.state += c->buf[offset].val_start; - } else { - c->ans.state += c->buf[offset].val_start ? c->buf[offset].prob : 0; - } - for (offset = offset - 1; offset >= 0; --offset) { - if (c->buf[offset].method == ANS_METHOD_RANS) { - rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob); - } else { - rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start, - (AnsP8)c->buf[offset].prob); - } - } - c->offset = 0; - c->output_bytes += ans_write_end(&c->ans); -} diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h deleted file mode 100644 index 985fcdf9e..000000000 --- a/third_party/aom/aom_dsp/buf_ans.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_BUF_ANS_H_ -#define AOM_AOM_DSP_BUF_ANS_H_ -// Buffered forward ANS writer. -// Symbols are written to the writer in forward (decode) order and serialized -// backwards due to ANS's stack like behavior. - -#include <assert.h> -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/ans.h" -#include "aom_dsp/answriter.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -#define ANS_METHOD_RABS 0 -#define ANS_METHOD_RANS 1 - -struct buffered_ans_symbol { - unsigned int method : 1; // one of ANS_METHOD_RABS or ANS_METHOD_RANS - // TODO(aconverse): Should be possible to write this in terms of start for ABS - unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS - // start in symbol cycle for Rans - unsigned int prob : RANS_PROB_BITS; // Probability of this symbol -}; - -struct BufAnsCoder { - struct aom_internal_error_info *error; - struct buffered_ans_symbol *buf; - struct AnsCoder ans; - int size; - int offset; - int output_bytes; -#if ANS_MAX_SYMBOLS - int window_size; -#endif - int pos; // Dummy variable to store the output buffer after closing - uint8_t allow_update_cdf; -}; - -// Allocate a buffered ANS coder to store size symbols. -// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS -// partition. 
-// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the -// buffer will grow on demand -void aom_buf_ans_alloc(struct BufAnsCoder *c, - struct aom_internal_error_info *error); - -void aom_buf_ans_free(struct BufAnsCoder *c); - -#if !ANS_MAX_SYMBOLS -void aom_buf_ans_grow(struct BufAnsCoder *c); -#endif - -void aom_buf_ans_flush(struct BufAnsCoder *const c); - -static INLINE void buf_ans_write_init(struct BufAnsCoder *const c, - uint8_t *const output_buffer) { - c->offset = 0; - c->output_bytes = 0; - ans_write_init(&c->ans, output_buffer); -} - -static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val, - AnsP8 prob) { - assert(c->offset <= c->size); -#if !ANS_MAX_SYMBOLS - if (c->offset == c->size) { - aom_buf_ans_grow(c); - } -#endif - c->buf[c->offset].method = ANS_METHOD_RABS; - c->buf[c->offset].val_start = val; - c->buf[c->offset].prob = prob; - ++c->offset; -#if ANS_MAX_SYMBOLS - if (c->offset == c->size) aom_buf_ans_flush(c); -#endif -} - -// Buffer one symbol for encoding using rANS. -// cum_prob: The cumulative probability before this symbol (the offset of -// the symbol in the symbol cycle) -// prob: The probability of this symbol (l_s from the paper) -// RANS_PRECISION takes the place of m from the paper. 
-static INLINE void buf_rans_write(struct BufAnsCoder *const c, - aom_cdf_prob cum_prob, aom_cdf_prob prob) { - assert(c->offset <= c->size); -#if !ANS_MAX_SYMBOLS - if (c->offset == c->size) { - aom_buf_ans_grow(c); - } -#endif - c->buf[c->offset].method = ANS_METHOD_RANS; - c->buf[c->offset].val_start = cum_prob; - c->buf[c->offset].prob = prob; - ++c->offset; -#if ANS_MAX_SYMBOLS - if (c->offset == c->size) aom_buf_ans_flush(c); -#endif -} - -static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) { - buf_rabs_write(c, bit, 128); -} - -static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal, - int bits) { - int bit; - - assert(bits < 31); - for (bit = bits - 1; bit >= 0; bit--) - buf_rabs_write_bit(c, 1 & (literal >> bit)); -} - -static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) { - assert(c->offset == 0); - return c->output_bytes; -} -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // AOM_AOM_DSP_BUF_ANS_H_ diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c deleted file mode 100644 index 6c2259f23..000000000 --- a/third_party/aom/aom_dsp/daalaboolreader.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/daalaboolreader.h" - -int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) { - if (size && !buffer) { - return 1; - } - r->buffer_end = buffer + size; - r->buffer = buffer; - od_ec_dec_init(&r->ec, buffer, size); -#if CONFIG_ACCOUNTING - r->accounting = NULL; -#endif - return 0; -} - -const uint8_t *aom_daala_reader_find_begin(daala_reader *r) { - return r->buffer; -} - -const uint8_t *aom_daala_reader_find_end(daala_reader *r) { - return r->buffer_end; -} - -uint32_t aom_daala_reader_tell(const daala_reader *r) { - return od_ec_dec_tell(&r->ec); -} - -uint32_t aom_daala_reader_tell_frac(const daala_reader *r) { - return od_ec_dec_tell_frac(&r->ec); -} - -int aom_daala_reader_has_overflowed(const daala_reader *r) { - const uint32_t tell_bits = aom_daala_reader_tell(r); - const uint32_t tell_bytes = (tell_bits + 7) >> 3; - return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer); -} diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h deleted file mode 100644 index ba78f916d..000000000 --- a/third_party/aom/aom_dsp/daalaboolreader.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_ -#define AOM_AOM_DSP_DAALABOOLREADER_H_ - -#include "aom/aom_integer.h" -#include "aom_dsp/entdec.h" -#include "aom_dsp/prob.h" -#if CONFIG_ACCOUNTING -#include "av1/decoder/accounting.h" -#endif -#if CONFIG_BITSTREAM_DEBUG -#include <stdio.h> -#include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG - -#ifdef __cplusplus -extern "C" { -#endif - -struct daala_reader { - const uint8_t *buffer; - const uint8_t *buffer_end; - od_ec_dec ec; -#if CONFIG_ACCOUNTING - Accounting *accounting; -#endif - uint8_t allow_update_cdf; -}; - -typedef struct daala_reader daala_reader; - -int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size); -const uint8_t *aom_daala_reader_find_begin(daala_reader *r); -const uint8_t *aom_daala_reader_find_end(daala_reader *r); -uint32_t aom_daala_reader_tell(const daala_reader *r); -uint32_t aom_daala_reader_tell_frac(const daala_reader *r); -// Returns true if the reader has tried to decode more data from the buffer -// than was actually provided. 
-int aom_daala_reader_has_overflowed(const daala_reader *r); - -static INLINE int aom_daala_read(daala_reader *r, int prob) { - int bit; - int p = (0x7FFFFF - (prob << 15) + prob) >> 8; -#if CONFIG_BITSTREAM_DEBUG -/*{ - const int queue_r = bitstream_queue_get_read(); - const int frame_idx = bitstream_queue_get_frame_read(); - if (frame_idx == 0 && queue_r == 0) { - fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n", - frame_idx, queue_r); - } -}*/ -#endif - - bit = od_ec_decode_bool_q15(&r->ec, p); - -#if CONFIG_BITSTREAM_DEBUG - { - int i; - int ref_bit, ref_nsymbs; - aom_cdf_prob ref_cdf[16]; - const int queue_r = bitstream_queue_get_read(); - const int frame_idx = bitstream_queue_get_frame_read(); - bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs); - if (ref_nsymbs != 2) { - fprintf(stderr, - "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs " - "%d queue_r %d\n", - frame_idx, 2, ref_nsymbs, queue_r); - assert(0); - } - if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) || - (ref_cdf[1] != 32767)) { - fprintf(stderr, - "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d", - frame_idx, p, 32767, ref_cdf[0]); - for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); - fprintf(stderr, "} queue_r %d\n", queue_r); - assert(0); - } - if (bit != ref_bit) { - fprintf(stderr, - "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d " - "queue_r %d\n", - frame_idx, bit, ref_bit, queue_r); - assert(0); - } - } -#endif - - return bit; -} - -static INLINE int aom_daala_reader_has_error(daala_reader *r) { - return r->ec.error; -} - -static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf, - int nsymbs) { - int symb; - assert(cdf != NULL); - symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); - -#if CONFIG_BITSTREAM_DEBUG - { - int i; - int cdf_error = 0; - int ref_symb, ref_nsymbs; - aom_cdf_prob ref_cdf[16]; - const int queue_r = bitstream_queue_get_read(); - const int frame_idx 
= bitstream_queue_get_frame_read(); - bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs); - if (nsymbs != ref_nsymbs) { - fprintf(stderr, - "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d " - "queue_r %d\n", - frame_idx, nsymbs, ref_nsymbs, queue_r); - cdf_error = 0; - assert(0); - } else { - for (i = 0; i < nsymbs; ++i) - if (cdf[i] != ref_cdf[i]) cdf_error = 1; - } - if (cdf_error) { - fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx, - cdf[0]); - for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]); - fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]); - for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); - fprintf(stderr, "} queue_r %d\n", queue_r); - assert(0); - } - if (symb != ref_symb) { - fprintf( - stderr, - "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n", - frame_idx, symb, ref_symb, queue_r); - assert(0); - } - } -#endif - - return symb; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_DAALABOOLREADER_H_ diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c deleted file mode 100644 index b24ffbf3f..000000000 --- a/third_party/aom/aom_dsp/daalaboolwriter.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <string.h> -#include "aom_dsp/daalaboolwriter.h" - -void aom_daala_start_encode(daala_writer *br, uint8_t *source) { - br->buffer = source; - br->pos = 0; - od_ec_enc_init(&br->ec, 62025); -} - -int aom_daala_stop_encode(daala_writer *br) { - int nb_bits; - uint32_t daala_bytes; - unsigned char *daala_data; - daala_data = od_ec_enc_done(&br->ec, &daala_bytes); - nb_bits = od_ec_enc_tell(&br->ec); - memcpy(br->buffer, daala_data, daala_bytes); - br->pos = daala_bytes; - od_ec_enc_clear(&br->ec); - return nb_bits; -} diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h deleted file mode 100644 index 3848877ce..000000000 --- a/third_party/aom/aom_dsp/daalaboolwriter.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_ -#define AOM_AOM_DSP_DAALABOOLWRITER_H_ - -#include <stdio.h> - -#include "aom_dsp/entenc.h" -#include "aom_dsp/prob.h" -#if CONFIG_BITSTREAM_DEBUG -#include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG - -#ifdef __cplusplus -extern "C" { -#endif - -struct daala_writer { - unsigned int pos; - uint8_t *buffer; - od_ec_enc ec; - uint8_t allow_update_cdf; -}; - -typedef struct daala_writer daala_writer; - -void aom_daala_start_encode(daala_writer *w, uint8_t *buffer); -int aom_daala_stop_encode(daala_writer *w); - -static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) { - int p = (0x7FFFFF - (prob << 15) + prob) >> 8; -#if CONFIG_BITSTREAM_DEBUG - aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; - /*int queue_r = 0; - int frame_idx_r = 0; - int queue_w = bitstream_queue_get_write(); - int frame_idx_w = bitstream_queue_get_frame_write(); - if (frame_idx_w == frame_idx_r && queue_w == queue_r) { - fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", - frame_idx_w, queue_w); - }*/ - bitstream_queue_push(bit, cdf, 2); -#endif - - od_ec_encode_bool_q15(&w->ec, bit, p); -} - -static INLINE void daala_write_symbol(daala_writer *w, int symb, - const aom_cdf_prob *cdf, int nsymbs) { -#if CONFIG_BITSTREAM_DEBUG - /*int queue_r = 0; - int frame_idx_r = 0; - int queue_w = bitstream_queue_get_write(); - int frame_idx_w = bitstream_queue_get_frame_write(); - if (frame_idx_w == frame_idx_r && queue_w == queue_r) { - fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", - frame_idx_w, queue_w); - }*/ - bitstream_queue_push(symb, cdf, nsymbs); -#endif - - od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs); -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_DAALABOOLWRITER_H_ diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c deleted file mode 100644 index aad96c6fc..000000000 --- a/third_party/aom/aom_dsp/entcode.c 
+++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/entcode.h" - -/*Given the current total integer number of bits used and the current value of - rng, computes the fraction number of bits used to OD_BITRES precision. - This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac(). - nbits_total: The number of whole bits currently used, i.e., the value - returned by od_ec_enc_tell() or od_ec_dec_tell(). - rng: The current value of rng from either the encoder or decoder state. - Return: The number of bits scaled by 2**OD_BITRES. - This will always be slightly larger than the exact value (e.g., all - rounding error is in the positive direction).*/ -uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) { - uint32_t nbits; - int l; - int i; - /*To handle the non-integral number of bits still left in the encoder/decoder - state, we compute the worst-case number of bits of val that must be - encoded to ensure that the value is inside the range for any possible - subsequent bits. - The computation here is independent of val itself (the decoder does not - even track that value), even though the real number of bits used after - od_ec_enc_done() may be 1 smaller if rng is a power of two and the - corresponding trailing bits of val are all zeros. - If we did try to track that special case, then coding a value with a - probability of 1/(1 << n) might sometimes appear to use more than n bits. 
- This may help explain the surprising result that a newly initialized - encoder or decoder claims to have used 1 bit.*/ - nbits = nbits_total << OD_BITRES; - l = 0; - for (i = OD_BITRES; i-- > 0;) { - int b; - rng = rng * rng >> 15; - b = (int)(rng >> 16); - l = l << 1 | b; - rng >>= b; - } - return nbits - l; -} diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h deleted file mode 100644 index 7ba2b1c39..000000000 --- a/third_party/aom/aom_dsp/entcode.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_ENTCODE_H_ -#define AOM_AOM_DSP_ENTCODE_H_ - -#include <limits.h> -#include <stddef.h> -#include "av1/common/odintrin.h" -#include "aom_dsp/prob.h" - -#define EC_PROB_SHIFT 6 -#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16 - -/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic - on a larger type, you can speed up the decoder by using it here.*/ -typedef uint32_t od_ec_window; - -#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT) - -/*The resolution of fractional-precision bit usage measurements, i.e., - 3 => 1/8th bits.*/ -#define OD_BITRES (3) - -#define OD_ICDF AOM_ICDF - -/*See entcode.c for further documentation.*/ - -OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, - uint32_t rng); - -#endif // AOM_AOM_DSP_ENTCODE_H_ diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c deleted file mode 100644 index d1764c47b..000000000 --- a/third_party/aom/aom_dsp/entdec.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include "aom_dsp/entdec.h" -#include "aom_dsp/prob.h" - -/*A range decoder. - This is an entropy decoder based upon \cite{Mar79}, which is itself a - rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. 
- It is very similar to arithmetic encoding, except that encoding is done with - digits in any base, instead of with bits, and so it is faster when using - larger bases (i.e.: a byte). - The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ - is the base, longer than the theoretical optimum, but to my knowledge there - is no published justification for this claim. - This only seems true when using near-infinite precision arithmetic so that - the process is carried out with no rounding errors. - - An excellent description of implementation details is available at - http://www.arturocampos.com/ac_range.html - A recent work \cite{MNW98} which proposes several changes to arithmetic - encoding for efficiency actually re-discovers many of the principles - behind range encoding, and presents a good theoretical analysis of them. - - End of stream is handled by writing out the smallest number of bits that - ensures that the stream will be correctly decoded regardless of the value of - any subsequent bits. - od_ec_dec_tell() can be used to determine how many bits were needed to decode - all the symbols thus far; other data can be packed in the remaining bits of - the input buffer. - @PHDTHESIS{Pas76, - author="Richard Clark Pasco", - title="Source coding algorithms for fast data compression", - school="Dept. of Electrical Engineering, Stanford University", - address="Stanford, CA", - month=May, - year=1976, - URL="http://www.richpasco.org/scaffdc.pdf" - } - @INPROCEEDINGS{Mar79, - author="Martin, G.N.N.", - title="Range encoding: an algorithm for removing redundancy from a digitised - message", - booktitle="Video & Data Recording Conference", - year=1979, - address="Southampton", - month=Jul, - URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" - } - @ARTICLE{MNW98, - author="Alistair Moffat and Radford Neal and Ian H. 
Witten", - title="Arithmetic Coding Revisited", - journal="{ACM} Transactions on Information Systems", - year=1998, - volume=16, - number=3, - pages="256--294", - month=Jul, - URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" - }*/ - -/*This is meant to be a large, positive constant that can still be efficiently - loaded as an immediate (on platforms like ARM, for example). - Even relatively modest values like 100 would work fine.*/ -#define OD_EC_LOTS_OF_BITS (0x4000) - -/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill - call.*/ -static void od_ec_dec_refill(od_ec_dec *dec) { - int s; - od_ec_window dif; - int16_t cnt; - const unsigned char *bptr; - const unsigned char *end; - dif = dec->dif; - cnt = dec->cnt; - bptr = dec->bptr; - end = dec->end; - s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); - for (; s >= 0 && bptr < end; s -= 8, bptr++) { - assert(s <= OD_EC_WINDOW_SIZE - 8); - dif ^= (od_ec_window)bptr[0] << s; - cnt += 8; - } - if (bptr >= end) { - dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt; - cnt = OD_EC_LOTS_OF_BITS; - } - dec->dif = dif; - dec->cnt = cnt; - dec->bptr = bptr; -} - -/*Takes updated dif and range values, renormalizes them so that - 32768 <= rng < 65536 (reading more bytes from the stream into dif if - necessary), and stores them back in the decoder context. - dif: The new value of dif. - rng: The new value of the range. - ret: The value to return. - Return: ret. - This allows the compiler to jump to this function via a tail-call.*/ -static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, - int ret) { - int d; - assert(rng <= 65535U); - // The number of leading zeros in the 16-bit binary representation of rng. - d = 16 - OD_ILOG_NZ(rng); - dec->cnt -= d; - /*This is equivalent to shifting in 1's instead of 0's.*/ - dec->dif = ((dif + 1) << d) - 1; - dec->rng = rng << d; - if (dec->cnt < 0) od_ec_dec_refill(dec); - return ret; -} - -/*Initializes the decoder. 
- buf: The input buffer to use. - Return: 0 on success, or a negative value on error.*/ -void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, - uint32_t storage) { - dec->buf = buf; - dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); - dec->end = buf + storage; - dec->bptr = buf; - dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; - dec->rng = 0x8000; - dec->cnt = -15; - dec->error = 0; - od_ec_dec_refill(dec); -} - -/*Decode a single binary value. - f: The probability that the bit is one, scaled by 32768. - Return: The value decoded (0 or 1).*/ -int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { - od_ec_window dif; - od_ec_window vw; - unsigned r; - unsigned r_new; - unsigned v; - int ret; - assert(0 < f); - assert(f < 32768U); - dif = dec->dif; - r = dec->rng; - assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); - assert(32768U <= r); - v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); - v += EC_MIN_PROB; - vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); - ret = 1; - r_new = v; - if (dif >= vw) { - r_new = r - v; - dif -= vw; - ret = 0; - } - return od_ec_dec_normalize(dec, dif, r_new, ret); -} - -/*Decodes a symbol given an inverse cumulative distribution function (CDF) - table in Q15. - icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range - [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]). - The values must be monotonically non-increasing, and icdf[nsyms - 1] - must be 0. - nsyms: The number of symbols in the alphabet. - This should be at most 16. 
- Return: The decoded symbol s.*/ -int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { - od_ec_window dif; - unsigned r; - unsigned c; - unsigned u; - unsigned v; - int ret; - (void)nsyms; - dif = dec->dif; - r = dec->rng; - const int N = nsyms - 1; - - assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); - assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); - assert(32768U <= r); - assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); - c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); - v = r; - ret = -1; - do { - u = v; - v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >> - (7 - EC_PROB_SHIFT - CDF_SHIFT)); - v += EC_MIN_PROB * (N - ret); - } while (c < v); - assert(v < u); - assert(u <= r); - r = u - v; - dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); - return od_ec_dec_normalize(dec, dif, r, ret); -} - -/*Returns the number of bits "used" by the decoded symbols so far. - This same number can be computed in either the encoder or the decoder, and is - suitable for making coding decisions. - Return: The number of bits. - This will always be slightly larger than the exact value (e.g., all - rounding error is in the positive direction).*/ -int od_ec_dec_tell(const od_ec_dec *dec) { - return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs); -} - -/*Returns the number of bits "used" by the decoded symbols so far. - This same number can be computed in either the encoder or the decoder, and is - suitable for making coding decisions. - Return: The number of bits scaled by 2**OD_BITRES. 
- This will always be slightly larger than the exact value (e.g., all - rounding error is in the positive direction).*/ -uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) { - return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng); -} diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h deleted file mode 100644 index 283bf1831..000000000 --- a/third_party/aom/aom_dsp/entdec.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_ENTDEC_H_ -#define AOM_AOM_DSP_ENTDEC_H_ -#include <limits.h> -#include "aom_dsp/entcode.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct od_ec_dec od_ec_dec; - -#if defined(OD_ACCOUNTING) && OD_ACCOUNTING -#define OD_ACC_STR , char *acc_str -#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str) -#else -#define OD_ACC_STR -#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb) -#endif - -/*The entropy decoder context.*/ -struct od_ec_dec { - /*The start of the current input buffer.*/ - const unsigned char *buf; - /*An offset used to keep track of tell after reaching the end of the stream. 
- This is constant throughout most of the decoding process, but becomes - important once we hit the end of the buffer and stop incrementing pointers - (and instead pretend cnt has lots of bits).*/ - int32_t tell_offs; - /*The end of the current input buffer.*/ - const unsigned char *end; - /*The read pointer for the entropy-coded bits.*/ - const unsigned char *bptr; - /*The difference between the high end of the current range, (low + rng), and - the coded value, minus 1. - This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the - decoder only uses the top 16 bits of the window to decode the next symbol. - As we shift up during renormalization, if we don't have enough bits left in - the window to fill the top 16, we'll read in more bits of the coded - value.*/ - od_ec_window dif; - /*The number of values in the current range.*/ - uint16_t rng; - /*The number of bits of data in the current value.*/ - int16_t cnt; - /*Nonzero if an error occurred.*/ - int error; -}; - -/*See entdec.c for further documentation.*/ - -void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) - OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); - -OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) - OD_ARG_NONNULL(1); -OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec, - const uint16_t *cdf, int nsyms) - OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); - -OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) - OD_ARG_NONNULL(1); - -OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec) - OD_ARG_NONNULL(1); -OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) - OD_ARG_NONNULL(1); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_ENTDEC_H_ diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c deleted file mode 100644 index a61da263c..000000000 --- a/third_party/aom/aom_dsp/entenc.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2001-2016, 
Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include <assert.h> -#include "aom_dsp/entenc.h" -#include "aom_dsp/prob.h" - -#if OD_MEASURE_EC_OVERHEAD -#if !defined(M_LOG2E) -#define M_LOG2E (1.4426950408889634073599246810019) -#endif -#define OD_LOG2(x) (M_LOG2E * log(x)) -#endif // OD_MEASURE_EC_OVERHEAD - -/*A range encoder. - See entdec.c and the references for implementation details \cite{Mar79,MNW98}. - - @INPROCEEDINGS{Mar79, - author="Martin, G.N.N.", - title="Range encoding: an algorithm for removing redundancy from a digitised - message", - booktitle="Video \& Data Recording Conference", - year=1979, - address="Southampton", - month=Jul, - URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" - } - @ARTICLE{MNW98, - author="Alistair Moffat and Radford Neal and Ian H. Witten", - title="Arithmetic Coding Revisited", - journal="{ACM} Transactions on Information Systems", - year=1998, - volume=16, - number=3, - pages="256--294", - month=Jul, - URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" - }*/ - -/*Takes updated low and range values, renormalizes them so that - 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if - necessary), and stores them back in the encoder context. - low: The new value of low. 
- rng: The new value of the range.*/ -static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, - unsigned rng) { - int d; - int c; - int s; - c = enc->cnt; - assert(rng <= 65535U); - // The number of leading zeros in the 16-bit binary representation of rng. - d = 16 - OD_ILOG_NZ(rng); - s = c + d; - /*TODO: Right now we flush every time we have at least one byte available. - Instead we should use an od_ec_window and flush right before we're about to - shift bits off the end of the window. - For a 32-bit window this is about the same amount of work, but for a 64-bit - window it should be a fair win.*/ - if (s >= 0) { - uint16_t *buf; - uint32_t storage; - uint32_t offs; - unsigned m; - buf = enc->precarry_buf; - storage = enc->precarry_storage; - offs = enc->offs; - if (offs + 2 > storage) { - storage = 2 * storage + 2; - buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); - if (buf == NULL) { - enc->error = -1; - enc->offs = 0; - return; - } - enc->precarry_buf = buf; - enc->precarry_storage = storage; - } - c += 16; - m = (1 << c) - 1; - if (s >= 8) { - assert(offs < storage); - buf[offs++] = (uint16_t)(low >> c); - low &= m; - c -= 8; - m >>= 8; - } - assert(offs < storage); - buf[offs++] = (uint16_t)(low >> c); - s = c + d - 24; - low &= m; - enc->offs = offs; - } - enc->low = low << d; - enc->rng = rng << d; - enc->cnt = s; -} - -/*Initializes the encoder. 
- size: The initial size of the buffer, in bytes.*/ -void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { - od_ec_enc_reset(enc); - enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size); - enc->storage = size; - if (size > 0 && enc->buf == NULL) { - enc->storage = 0; - enc->error = -1; - } - enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size); - enc->precarry_storage = size; - if (size > 0 && enc->precarry_buf == NULL) { - enc->precarry_storage = 0; - enc->error = -1; - } -} - -/*Reinitializes the encoder.*/ -void od_ec_enc_reset(od_ec_enc *enc) { - enc->offs = 0; - enc->low = 0; - enc->rng = 0x8000; - /*This is initialized to -9 so that it crosses zero after we've accumulated - one byte + one carry bit.*/ - enc->cnt = -9; - enc->error = 0; -#if OD_MEASURE_EC_OVERHEAD - enc->entropy = 0; - enc->nb_symbols = 0; -#endif -} - -/*Frees the buffers used by the encoder.*/ -void od_ec_enc_clear(od_ec_enc *enc) { - free(enc->precarry_buf); - free(enc->buf); -} - -/*Encodes a symbol given its frequency in Q15. - fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come - before the - one to be encoded. 
- fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and - including - the one to be encoded.*/ -static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s, - int nsyms) { - od_ec_window l; - unsigned r; - unsigned u; - unsigned v; - l = enc->low; - r = enc->rng; - assert(32768U <= r); - assert(fh <= fl); - assert(fl <= 32768U); - assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); - const int N = nsyms - 1; - if (fl < CDF_PROB_TOP) { - u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> - (7 - EC_PROB_SHIFT - CDF_SHIFT)) + - EC_MIN_PROB * (N - (s - 1)); - v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> - (7 - EC_PROB_SHIFT - CDF_SHIFT)) + - EC_MIN_PROB * (N - (s + 0)); - l += r - u; - r = u - v; - } else { - r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> - (7 - EC_PROB_SHIFT - CDF_SHIFT)) + - EC_MIN_PROB * (N - (s + 0)); - } - od_ec_enc_normalize(enc, l, r); -#if OD_MEASURE_EC_OVERHEAD - enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP.); - enc->nb_symbols++; -#endif -} - -/*Encode a single binary value. - val: The value to encode (0 or 1). - f: The probability that the val is one, scaled by 32768.*/ -void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { - od_ec_window l; - unsigned r; - unsigned v; - assert(0 < f); - assert(f < 32768U); - l = enc->low; - r = enc->rng; - assert(32768U <= r); - v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); - v += EC_MIN_PROB; - if (val) l += r - v; - r = val ? v : r - v; - od_ec_enc_normalize(enc, l, r); -#if OD_MEASURE_EC_OVERHEAD - enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.); - enc->nb_symbols++; -#endif -} - -/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. - s: The index of the symbol to encode. - icdf: 32768 minus the CDF, such that symbol s falls in the range - [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). 
- The values must be monotonically decreasing, and icdf[nsyms - 1] must - be 0. - nsyms: The number of symbols in the alphabet. - This should be at most 16.*/ -void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, - int nsyms) { - (void)nsyms; - assert(s >= 0); - assert(s < nsyms); - assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); - od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms); -} - -/*Overwrites a few bits at the very start of an existing stream, after they - have already been encoded. - This makes it possible to have a few flags up front, where it is easy for - decoders to access them without parsing the whole stream, even if their - values are not determined until late in the encoding process, without having - to buffer all the intermediate symbols in the encoder. - In order for this to work, at least nbits bits must have already been encoded - using probabilities that are an exact power of two. - The encoder can verify the number of encoded bits is sufficient, but cannot - check this latter condition. - val: The bits to encode (in the least nbits significant bits). - They will be decoded in order from most-significant to least. - nbits: The number of bits to overwrite. 
- This must be no more than 8.*/ -void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) { - int shift; - unsigned mask; - assert(nbits >= 0); - assert(nbits <= 8); - assert(val < 1U << nbits); - shift = 8 - nbits; - mask = ((1U << nbits) - 1) << shift; - if (enc->offs > 0) { - /*The first byte has been finalized.*/ - enc->precarry_buf[0] = - (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift); - } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) { - /*The first byte has yet to be output.*/ - enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) | - (od_ec_window)val << (16 + enc->cnt + shift); - } else { - /*The encoder hasn't even encoded _nbits of data yet.*/ - enc->error = -1; - } -} - -#if OD_MEASURE_EC_OVERHEAD -#include <stdio.h> -#endif - -/*Indicates that there are no more symbols to encode. - All remaining output bytes are flushed to the output buffer. - od_ec_enc_reset() should be called before using the encoder again. - bytes: Returns the size of the encoded data in the returned buffer. - Return: A pointer to the start of the final buffer, or NULL if there was an - encoding error.*/ -unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { - unsigned char *out; - uint32_t storage; - uint16_t *buf; - uint32_t offs; - od_ec_window m; - od_ec_window e; - od_ec_window l; - int c; - int s; - if (enc->error) return NULL; -#if OD_MEASURE_EC_OVERHEAD - { - uint32_t tell; - /* Don't count the 1 bit we lose to raw bits as overhead. 
*/ - tell = od_ec_enc_tell(enc) - 1; - fprintf(stderr, "overhead: %f%%\n", - 100 * (tell - enc->entropy) / enc->entropy); - fprintf(stderr, "efficiency: %f bits/symbol\n", - (double)tell / enc->nb_symbols); - } -#endif - /*We output the minimum number of bits that ensures that the symbols encoded - thus far will be decoded correctly regardless of the bits that follow.*/ - l = enc->low; - c = enc->cnt; - s = 10; - m = 0x3FFF; - e = ((l + m) & ~m) | (m + 1); - s += c; - offs = enc->offs; - buf = enc->precarry_buf; - if (s > 0) { - unsigned n; - storage = enc->precarry_storage; - if (offs + ((s + 7) >> 3) > storage) { - storage = storage * 2 + ((s + 7) >> 3); - buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); - if (buf == NULL) { - enc->error = -1; - return NULL; - } - enc->precarry_buf = buf; - enc->precarry_storage = storage; - } - n = (1 << (c + 16)) - 1; - do { - assert(offs < storage); - buf[offs++] = (uint16_t)(e >> (c + 16)); - e &= n; - s -= 8; - c -= 8; - n >>= 8; - } while (s > 0); - } - /*Make sure there's enough room for the entropy-coded bits.*/ - out = enc->buf; - storage = enc->storage; - c = OD_MAXI((s + 7) >> 3, 0); - if (offs + c > storage) { - storage = offs + c; - out = (unsigned char *)realloc(out, sizeof(*out) * storage); - if (out == NULL) { - enc->error = -1; - return NULL; - } - enc->buf = out; - enc->storage = storage; - } - *nbytes = offs; - /*Perform carry propagation.*/ - assert(offs <= storage); - out = out + storage - offs; - c = 0; - while (offs > 0) { - offs--; - c = buf[offs] + c; - out[offs] = (unsigned char)c; - c >>= 8; - } - /*Note: Unless there's an allocation error, if you keep encoding into the - current buffer and call this function again later, everything will work - just fine (you won't get a new packet out, but you will get a single - buffer with the new data appended to the old). 
- However, this function is O(N) where N is the amount of data coded so far, - so calling it more than once for a given packet is a bad idea.*/ - return out; -} - -/*Returns the number of bits "used" by the encoded symbols so far. - This same number can be computed in either the encoder or the decoder, and is - suitable for making coding decisions. - Warning: The value returned by this function can decrease compared to an - earlier call, even after encoding more data, if there is an encoding error - (i.e., a failure to allocate enough space for the output buffer). - Return: The number of bits. - This will always be slightly larger than the exact value (e.g., all - rounding error is in the positive direction).*/ -int od_ec_enc_tell(const od_ec_enc *enc) { - /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra - bit, which we reserve for terminating the stream.*/ - return (enc->cnt + 10) + enc->offs * 8; -} - -/*Returns the number of bits "used" by the encoded symbols so far. - This same number can be computed in either the encoder or the decoder, and is - suitable for making coding decisions. - Warning: The value returned by this function can decrease compared to an - earlier call, even after encoding more data, if there is an encoding error - (i.e., a failure to allocate enough space for the output buffer). - Return: The number of bits scaled by 2**OD_BITRES. - This will always be slightly larger than the exact value (e.g., all - rounding error is in the positive direction).*/ -uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { - return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); -} - -/*Saves a entropy coder checkpoint to dst. 
- This allows an encoder to reverse a series of entropy coder - decisions if it decides that the information would have been - better coded some other way.*/ -void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) { - OD_COPY(dst, src, 1); -} - -/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint. - This can only be used to restore from checkpoints earlier in the target - state's history: you can not switch backwards and forwards or otherwise - switch to a state which isn't a casual ancestor of the current state. - Restore is also incompatible with patching the initial bits, as the - changes will remain in the restored version.*/ -void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { - unsigned char *buf; - uint32_t storage; - uint16_t *precarry_buf; - uint32_t precarry_storage; - assert(dst->storage >= src->storage); - assert(dst->precarry_storage >= src->precarry_storage); - buf = dst->buf; - storage = dst->storage; - precarry_buf = dst->precarry_buf; - precarry_storage = dst->precarry_storage; - OD_COPY(dst, src, 1); - dst->buf = buf; - dst->storage = storage; - dst->precarry_buf = precarry_buf; - dst->precarry_storage = precarry_storage; -} diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h deleted file mode 100644 index 3551d4250..000000000 --- a/third_party/aom/aom_dsp/entenc.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_ENTENC_H_ -#define AOM_AOM_DSP_ENTENC_H_ -#include <stddef.h> -#include "aom_dsp/entcode.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct od_ec_enc od_ec_enc; - -#define OD_MEASURE_EC_OVERHEAD (0) - -/*The entropy encoder context.*/ -struct od_ec_enc { - /*Buffered output. - This contains only the raw bits until the final call to od_ec_enc_done(), - where all the arithmetic-coded data gets prepended to it.*/ - unsigned char *buf; - /*The size of the buffer.*/ - uint32_t storage; - /*A buffer for output bytes with their associated carry flags.*/ - uint16_t *precarry_buf; - /*The size of the pre-carry buffer.*/ - uint32_t precarry_storage; - /*The offset at which the next entropy-coded byte will be written.*/ - uint32_t offs; - /*The low end of the current range.*/ - od_ec_window low; - /*The number of values in the current range.*/ - uint16_t rng; - /*The number of bits of data in the current value.*/ - int16_t cnt; - /*Nonzero if an error occurred.*/ - int error; -#if OD_MEASURE_EC_OVERHEAD - double entropy; - int nb_symbols; -#endif -}; - -/*See entenc.c for further documentation.*/ - -void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1); -void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1); -void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1); - -void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15) - OD_ARG_NONNULL(1); -void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) - OD_ARG_NONNULL(1) OD_ARG_NONNULL(3); - -void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) - OD_ARG_NONNULL(1); - -void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) - OD_ARG_NONNULL(1); -OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc, - uint32_t *nbytes) - OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); - -OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc) - OD_ARG_NONNULL(1); -OD_WARN_UNUSED_RESULT uint32_t 
od_ec_enc_tell_frac(const od_ec_enc *enc) - OD_ARG_NONNULL(1); - -void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src); -void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_ENTENC_H_ diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c deleted file mode 100644 index 3804519b3..000000000 --- a/third_party/aom/aom_dsp/fastssim.c +++ /dev/null @@ -1,487 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - * - * This code was originally written by: Nathan E. Egge, at the Daala - * project. - */ -#include <assert.h> -#include <math.h> -#include <stdlib.h> -#include <string.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/ssim.h" -#include "aom_ports/system_state.h" - -typedef struct fs_level fs_level; -typedef struct fs_ctx fs_ctx; - -#define SSIM_C1 (255 * 255 * 0.01 * 0.01) -#define SSIM_C2 (255 * 255 * 0.03 * 0.03) -#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) -#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) -#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) -#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) - -#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) -#define FS_MAXI(_a, _b) ((_a) > (_b) ? 
(_a) : (_b)) - -struct fs_level { - uint32_t *im1; - uint32_t *im2; - double *ssim; - int w; - int h; -}; - -struct fs_ctx { - fs_level *level; - int nlevels; - unsigned *col_buf; -}; - -static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { - unsigned char *data; - size_t data_size; - int lw; - int lh; - int l; - lw = (_w + 1) >> 1; - lh = (_h + 1) >> 1; - data_size = - _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); - for (l = 0; l < _nlevels; l++) { - size_t im_size; - size_t level_size; - im_size = lw * (size_t)lh; - level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); - level_size += sizeof(*_ctx->level[l].ssim) - 1; - level_size /= sizeof(*_ctx->level[l].ssim); - level_size += im_size; - level_size *= sizeof(*_ctx->level[l].ssim); - data_size += level_size; - lw = (lw + 1) >> 1; - lh = (lh + 1) >> 1; - } - data = (unsigned char *)malloc(data_size); - _ctx->level = (fs_level *)data; - _ctx->nlevels = _nlevels; - data += _nlevels * sizeof(*_ctx->level); - lw = (_w + 1) >> 1; - lh = (_h + 1) >> 1; - for (l = 0; l < _nlevels; l++) { - size_t im_size; - size_t level_size; - _ctx->level[l].w = lw; - _ctx->level[l].h = lh; - im_size = lw * (size_t)lh; - level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); - level_size += sizeof(*_ctx->level[l].ssim) - 1; - level_size /= sizeof(*_ctx->level[l].ssim); - level_size *= sizeof(*_ctx->level[l].ssim); - _ctx->level[l].im1 = (uint32_t *)data; - _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; - data += level_size; - _ctx->level[l].ssim = (double *)data; - data += im_size * sizeof(*_ctx->level[l].ssim); - lw = (lw + 1) >> 1; - lh = (lh + 1) >> 1; - } - _ctx->col_buf = (unsigned *)data; -} - -static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } - -static void fs_downsample_level(fs_ctx *_ctx, int _l) { - const uint32_t *src1; - const uint32_t *src2; - uint32_t *dst1; - uint32_t *dst2; - int w2; - int h2; - int w; - int h; - int i; - int j; - w = _ctx->level[_l].w; - h = 
_ctx->level[_l].h; - dst1 = _ctx->level[_l].im1; - dst2 = _ctx->level[_l].im2; - w2 = _ctx->level[_l - 1].w; - h2 = _ctx->level[_l - 1].h; - src1 = _ctx->level[_l - 1].im1; - src2 = _ctx->level[_l - 1].im2; - for (j = 0; j < h; j++) { - int j0offs; - int j1offs; - j0offs = 2 * j * w2; - j1offs = FS_MINI(2 * j + 1, h2) * w2; - for (i = 0; i < w; i++) { - int i0; - int i1; - i0 = 2 * i; - i1 = FS_MINI(i0 + 1, w2); - dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + - src1[j1offs + i0] + src1[j1offs + i1]; - dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + - src2[j1offs + i0] + src2[j1offs + i1]; - } - } -} - -static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, - int _s1ystride, const uint8_t *_src2, - int _s2ystride, int _w, int _h, uint32_t shift, - int buf_is_hbd) { - uint32_t *dst1; - uint32_t *dst2; - int w; - int h; - int i; - int j; - w = _ctx->level[0].w; - h = _ctx->level[0].h; - dst1 = _ctx->level[0].im1; - dst2 = _ctx->level[0].im2; - for (j = 0; j < h; j++) { - int j0; - int j1; - j0 = 2 * j; - j1 = FS_MINI(j0 + 1, _h); - for (i = 0; i < w; i++) { - int i0; - int i1; - i0 = 2 * i; - i1 = FS_MINI(i0 + 1, _w); - if (!buf_is_hbd) { - dst1[j * w + i] = - _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + - _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; - dst2[j * w + i] = - _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + - _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; - } else { - uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); - uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); - dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + - (src1s[j0 * _s1ystride + i1] >> shift) + - (src1s[j1 * _s1ystride + i0] >> shift) + - (src1s[j1 * _s1ystride + i1] >> shift); - dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + - (src2s[j0 * _s2ystride + i1] >> shift) + - (src2s[j1 * _s2ystride + i0] >> shift) + - (src2s[j1 * _s2ystride + i1] >> shift); - } - } - } -} - -static void 
fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { - unsigned *col_sums_x; - unsigned *col_sums_y; - uint32_t *im1; - uint32_t *im2; - double *ssim; - double c1; - int w; - int h; - int j0offs; - int j1offs; - int i; - int j; - double ssim_c1 = SSIM_C1; - - if (bit_depth == 10) ssim_c1 = SSIM_C1_10; - if (bit_depth == 12) ssim_c1 = SSIM_C1_12; - - w = _ctx->level[_l].w; - h = _ctx->level[_l].h; - col_sums_x = _ctx->col_buf; - col_sums_y = col_sums_x + w; - im1 = _ctx->level[_l].im1; - im2 = _ctx->level[_l].im2; - for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; - for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; - for (j = 1; j < 4; j++) { - j1offs = FS_MINI(j, h - 1) * w; - for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; - } - ssim = _ctx->level[_l].ssim; - c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); - for (j = 0; j < h; j++) { - unsigned mux; - unsigned muy; - int i0; - int i1; - mux = 5 * col_sums_x[0]; - muy = 5 * col_sums_y[0]; - for (i = 1; i < 4; i++) { - i1 = FS_MINI(i, w - 1); - mux += col_sums_x[i1]; - muy += col_sums_y[i1]; - } - for (i = 0; i < w; i++) { - ssim[j * w + i] *= (2 * mux * (double)muy + c1) / - (mux * (double)mux + muy * (double)muy + c1); - if (i + 1 < w) { - i0 = FS_MAXI(0, i - 4); - i1 = FS_MINI(i + 4, w - 1); - mux += col_sums_x[i1] - col_sums_x[i0]; - muy += col_sums_x[i1] - col_sums_x[i0]; - } - } - if (j + 1 < h) { - j0offs = FS_MAXI(0, j - 4) * w; - for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; - for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; - j1offs = FS_MINI(j + 4, h - 1) * w; - for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; - } - } -} - -#define FS_COL_SET(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ - gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - 
col_sums_gx2[(_col)] = gx * (double)gx; \ - col_sums_gy2[(_col)] = gy * (double)gy; \ - col_sums_gxgy[(_col)] = gx * (double)gy; \ - } while (0) - -#define FS_COL_ADD(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ - gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] += gx * (double)gx; \ - col_sums_gy2[(_col)] += gy * (double)gy; \ - col_sums_gxgy[(_col)] += gx * (double)gy; \ - } while (0) - -#define FS_COL_SUB(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ - gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] -= gx * (double)gx; \ - col_sums_gy2[(_col)] -= gy * (double)gy; \ - col_sums_gxgy[(_col)] -= gx * (double)gy; \ - } while (0) - -#define FS_COL_COPY(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ - col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ - } while (0) - -#define FS_COL_HALVE(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ - col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ - } while (0) - -#define FS_COL_DOUBLE(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ - col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ - } while (0) - -static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { - uint32_t *im1; - uint32_t *im2; - unsigned *gx_buf; - unsigned *gy_buf; - double *ssim; - double col_sums_gx2[8]; - double col_sums_gy2[8]; - double col_sums_gxgy[8]; - double c2; - int stride; - int w; - int h; - int i; - int j; - double ssim_c2 = SSIM_C2; - if (bit_depth == 10) ssim_c2 = SSIM_C2_10; - if (bit_depth == 12) ssim_c2 = SSIM_C2_12; - - w = 
_ctx->level[_l].w; - h = _ctx->level[_l].h; - im1 = _ctx->level[_l].im1; - im2 = _ctx->level[_l].im2; - ssim = _ctx->level[_l].ssim; - gx_buf = _ctx->col_buf; - stride = w + 8; - gy_buf = gx_buf + 8 * stride; - memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); - c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; - for (j = 0; j < h + 4; j++) { - if (j < h - 1) { - for (i = 0; i < w - 1; i++) { - unsigned g1; - unsigned g2; - unsigned gx; - unsigned gy; - g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); - g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); - gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); - g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); - gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - gx_buf[(j & 7) * stride + i + 4] = gx; - gy_buf[(j & 7) * stride + i + 4] = gy; - } - } else { - memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); - memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); - } - if (j >= 4) { - int k; - col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; - col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; - col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = - col_sums_gxgy[0] = 0; - for (i = 4; i < 8; i++) { - FS_COL_SET(i, -1, 0); - FS_COL_ADD(i, 0, 0); - for (k = 1; k < 8 - i; k++) { - FS_COL_DOUBLE(i, i); - FS_COL_ADD(i, -k - 1, 0); - FS_COL_ADD(i, k, 0); - } - } - for (i = 0; i < w; i++) { - double mugx2; - double mugy2; - double mugxgy; - mugx2 = col_sums_gx2[0]; - for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; - mugy2 = col_sums_gy2[0]; - for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; - mugxgy = col_sums_gxgy[0]; - for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; - ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); - if (i + 1 < w) { - FS_COL_SET(0, -1, 1); - FS_COL_ADD(0, 0, 1); - FS_COL_SUB(2, -3, 2); - FS_COL_SUB(2, 2, 2); - 
FS_COL_HALVE(1, 2); - FS_COL_SUB(3, -4, 3); - FS_COL_SUB(3, 3, 3); - FS_COL_HALVE(2, 3); - FS_COL_COPY(3, 4); - FS_COL_DOUBLE(4, 5); - FS_COL_ADD(4, -4, 5); - FS_COL_ADD(4, 3, 5); - FS_COL_DOUBLE(5, 6); - FS_COL_ADD(5, -3, 6); - FS_COL_ADD(5, 2, 6); - FS_COL_DOUBLE(6, 7); - FS_COL_ADD(6, -2, 7); - FS_COL_ADD(6, 1, 7); - FS_COL_SET(7, -1, 8); - FS_COL_ADD(7, 0, 8); - } - } - } - } -} - -#define FS_NLEVELS (4) - -/*These weights were derived from the default weights found in Wang's original - Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. - We drop the finest scale and renormalize the rest to sum to 1.*/ - -static const double FS_WEIGHTS[FS_NLEVELS] = { - 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 -}; - -static double fs_average(fs_ctx *_ctx, int _l) { - double *ssim; - double ret; - int w; - int h; - int i; - int j; - w = _ctx->level[_l].w; - h = _ctx->level[_l].h; - ssim = _ctx->level[_l].ssim; - ret = 0; - for (j = 0; j < h; j++) - for (i = 0; i < w; i++) ret += ssim[j * w + i]; - return pow(ret / (w * h), FS_WEIGHTS[_l]); -} - -static double convert_ssim_db(double _ssim, double _weight) { - assert(_weight >= _ssim); - if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; - return 10 * (log10(_weight) - log10(_weight - _ssim)); -} - -static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, - int _dystride, int _w, int _h, uint32_t _bd, - uint32_t _shift, int buf_is_hbd) { - fs_ctx ctx; - double ret; - int l; - ret = 1; - fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); - fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift, - buf_is_hbd); - for (l = 0; l < FS_NLEVELS - 1; l++) { - fs_calc_structure(&ctx, l, _bd); - ret *= fs_average(&ctx, l); - fs_downsample_level(&ctx, l + 1); - } - fs_calc_structure(&ctx, l, _bd); - fs_apply_luminance(&ctx, l, _bd); - ret *= fs_average(&ctx, l); - fs_ctx_clear(&ctx); - return ret; -} - -double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, - 
const YV12_BUFFER_CONFIG *dest, double *ssim_y, - double *ssim_u, double *ssim_v, uint32_t bd, - uint32_t in_bd) { - double ssimv; - uint32_t bd_shift = 0; - aom_clear_system_state(); - assert(bd >= in_bd); - assert(source->flags == dest->flags); - int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH; - bd_shift = bd - in_bd; - - *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, - dest->y_stride, source->y_crop_width, - source->y_crop_height, in_bd, bd_shift, buf_is_hbd); - *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); - *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); - ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); - return convert_ssim_db(ssimv, 1.0); -} diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c deleted file mode 100644 index 0ba71cfb3..000000000 --- a/third_party/aom/aom_dsp/fft.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/fft_common.h" - -static INLINE void simple_transpose(const float *A, float *B, int n) { - for (int y = 0; y < n; y++) { - for (int x = 0; x < n; x++) { - B[y * n + x] = A[x * n + y]; - } - } -} - -// The 1d transform is real to complex and packs the complex results in -// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real -// components, followed by the n/2 - 1 imaginary components). After the -// transform is done on the rows, the first n/2 + 1 columns are real, and -// the remaining are the imaginary components. After the transform on the -// columns, the region of [0, n/2]x[0, n/2] contains the real part of -// fft of the real columns. The real part of the 2d fft also includes the -// imaginary part of transformed imaginary columns. This function assembles -// the correct outputs while putting the real and imaginary components -// next to each other. -static INLINE void unpack_2d_output(const float *col_fft, float *output, - int n) { - for (int y = 0; y <= n / 2; ++y) { - const int y2 = y + n / 2; - const int y_extra = y2 > n / 2 && y2 < n; - - for (int x = 0; x <= n / 2; ++x) { - const int x2 = x + n / 2; - const int x_extra = x2 > n / 2 && x2 < n; - output[2 * (y * n + x)] = - col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); - output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + - (x_extra ? col_fft[y * n + x2] : 0); - if (y_extra) { - output[2 * ((n - y) * n + x)] = - col_fft[y * n + x] + - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); - output[2 * ((n - y) * n + x) + 1] = - -(y_extra ? col_fft[y2 * n + x] : 0) + - (x_extra ? 
col_fft[y * n + x2] : 0); - } - } - } -} - -void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, - aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, - aom_fft_unpack_func_t unpack, int vec_size) { - for (int x = 0; x < n; x += vec_size) { - tform(input + x, output + x, n); - } - transpose(output, temp, n); - - for (int x = 0; x < n; x += vec_size) { - tform(temp + x, output + x, n); - } - transpose(output, temp, n); - - unpack(temp, output, n); -} - -static INLINE void store_float(float *output, float input) { *output = input; } -static INLINE float add_float(float a, float b) { return a + b; } -static INLINE float sub_float(float a, float b) { return a - b; } -static INLINE float mul_float(float a, float b) { return a * b; } - -GEN_FFT_2(void, float, float, float, *, store_float); -GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, - sub_float); -GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, - sub_float, mul_float); -GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, - sub_float, mul_float); -GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, - sub_float, mul_float); - -void aom_fft2x2_float_c(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, - unpack_2d_output, 1); -} - -void aom_fft4x4_float_c(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, - unpack_2d_output, 1); -} - -void aom_fft8x8_float_c(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, - unpack_2d_output, 1); -} - -void aom_fft16x16_float_c(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose, - unpack_2d_output, 1); -} - -void aom_fft32x32_float_c(const float *input, float 
// Generic 2D inverse FFT driver.
//
// `input` is read as interleaved complex values: element (y, x) has its real
// part at input[2 * (y * n + x)] and its imaginary part in the next float.
// `temp` and `output` are both used as n x n working storage; the final
// result is produced by transpose(temp, output, n).
//
// Order of operations:
//  1. Input columns 0 and n/2 are packed into columns 0 and 1 of `output`
//     (real parts of rows 0..n/2, then imaginary parts of rows 1..n/2-1) and
//     passed through ifft_multi.
//  2. For input columns 1..n/2-1, the real parts go to output columns
//     2..n/2 and the imaginary parts to output columns n/2+1..n-1. These
//     columns are passed through the caller-supplied fft_single / fft_multi.
//  3. The results in `temp` are recombined and transposed into `output`.
//  4. ifft_multi runs over every column of `output`, and the result is
//     transposed back into `output`.
//
// All 1D transforms are called with a stride of n.
void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
                     aom_fft_1d_func_t ifft_multi,
                     aom_fft_transpose_func_t transpose, int vec_size) {
  // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
  // and get real outputs.
  // Rows 0..n/2 carry the real parts of those two columns.
  for (int y = 0; y <= n / 2; ++y) {
    output[y * n] = input[2 * y * n];
    output[y * n + 1] = input[2 * (y * n + n / 2)];
  }
  // Rows n/2+1..n-1 carry the imaginary parts of input rows 1..n/2-1.
  for (int y = n / 2 + 1; y < n; ++y) {
    output[y * n] = input[2 * (y - n / 2) * n + 1];
    output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
  }

  // Transform the two packed columns (0 and 1), vec_size columns per call.
  for (int i = 0; i < 2; i += vec_size) {
    ifft_multi(output + i, temp + i, n);
  }

  // For the other columns, since we don't have a full ifft for complex inputs
  // we have to split them into the real and imaginary counterparts.
  // Pack the real component, then the imaginary components.
  for (int y = 0; y < n; ++y) {
    for (int x = 1; x < n / 2; ++x) {
      output[y * n + (x + 1)] = input[2 * (y * n + x)];
    }
    for (int x = 1; x < n / 2; ++x) {
      output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
    }
  }
  // Here `y` indexes columns of `output`. Columns 2..vec_size-1 are handled
  // one at a time with fft_single.
  for (int y = 2; y < vec_size; y++) {
    fft_single(output + y, temp + y, n);
  }
  // This is the part that can be sped up with SIMD
  for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
    fft_multi(output + y, temp + y, n);
  }

  // Put the 0 and n/2 th results in the correct place.
  // Column 0 of temp becomes row 0 of output; column 1 becomes row n/2.
  for (int x = 0; x < n; ++x) {
    output[x] = temp[x * n];
    output[(n / 2) * n + x] = temp[x * n + 1];
  }
  // This rearranges and transposes.
  for (int y = 1; y < n / 2; ++y) {
    // Fill in the real columns
    for (int x = 0; x <= n / 2; ++x) {
      output[x + y * n] =
          temp[(y + 1) + x * n] +
          ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
    }
    // Second half of the row reads the mirrored index (n - x).
    for (int x = n / 2 + 1; x < n; ++x) {
      output[x + y * n] = temp[(y + 1) + (n - x) * n] -
                          temp[(y + n / 2) + ((n - x) + n / 2) * n];
    }
    // Fill in the imag columns
    for (int x = 0; x <= n / 2; ++x) {
      output[x + (y + n / 2) * n] =
          temp[(y + n / 2) + x * n] -
          ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
    }
    for (int x = n / 2 + 1; x < n; ++x) {
      output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
                                    temp[(y + n / 2) + (n - x) * n];
    }
  }
  // Final inverse pass over every column, then transpose into `output`.
  for (int y = 0; y < n; y += vec_size) {
    ifft_multi(output + y, temp + y, n);
  }
  transpose(temp, output, n);
}
float *output) { - aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, - aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1); -} diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h deleted file mode 100644 index 5137331ae..000000000 --- a/third_party/aom/aom_dsp/fft_common.h +++ /dev/null @@ -1,1050 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_FFT_COMMON_H_ -#define AOM_AOM_DSP_FFT_COMMON_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/*!\brief A function pointer for computing 1d fft and ifft. - * - * The function will point to an implementation for a specific transform size, - * and may perform the transforms using vectorized instructions. - * - * For a non-vectorized forward transforms of size n, the input and output - * buffers will be size n. The output takes advantage of conjugate symmetry and - * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where - * (r_{j}, i_{j}) is the complex output for index j. - * - * An inverse transform will assume that the complex "input" is packed - * similarly. Its output will be real. - * - * Non-vectorized transforms (e.g., on a single row) would use a stride = 1. - * - * Vectorized implementations are parallelized along the columns so that the fft - * can be performed on multiple columns at a time. 
/*!\brief Function pointer for transposing a matrix of floats.
 *
 * \param[in]  input  Input buffer (size n x n)
 * \param[out] output Output buffer (size n x n)
 * \param[in]  n      Extent of one dimension of the square matrix.
 */
typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
                                         int n);

/*!\brief Function pointer for re-arranging intermediate 2d transform results.
 *
 * After re-arrangement, the real and imaginary components will be packed
 * tightly next to each other.
 *
 * \param[in]  input  Input buffer (size n x n)
 * \param[out] output Output buffer (size 2 x n x n)
 * \param[in]  n      Extent of one dimension of the square matrix.
 */
typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
/*!\brief Perform a 2d inverse fft with the given helper functions
 *
 * \param[in]  input      Input buffer to run the transform on
 *                        (size 2 x n x n)
 * \param[out] temp       Working buffer for computations (size 2 x n x n)
 * \param[out] output     Output buffer (size n x n)
 * \param[in]  n          Extent of one dimension of the square transform
 * \param[in]  fft_single Forward transform function (non vectorized)
 * \param[in]  fft_multi  Forward transform function (vectorized)
 * \param[in]  ifft_multi Inverse transform function (vectorized)
 * \param[in]  transpose  Transpose function (for n x n matrix)
 * \param[in]  vec_size   Vector size (the transform is done vec_size
 *                        units at a time)
 */
void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
                     aom_fft_1d_func_t ifft_multi,
                     aom_fft_transpose_func_t transpose, int vec_size);
// Generates aom_fft1d_4_<suffix>: an unrolled 4-point forward FFT of real
// input. The function reads four values at input + k * stride and stores four
// values at output + k * stride, packed as [r_0, r_1, r_2, i_1].
//
//   ret       return type of the generated function
//   T, T_VEC  element type of the buffers / type used for loaded values
//   load, store, constant, add, sub
//             operations used to read, write, splat a constant and combine
//             T_VEC values
#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kZero = constant(0.0f);                                   \
    const T_VEC x0 = load(input + 0 * stride);                            \
    const T_VEC x1 = load(input + 1 * stride);                            \
    const T_VEC x2 = load(input + 2 * stride);                            \
    const T_VEC x3 = load(input + 3 * stride);                            \
    const T_VEC even_sum = add(x0, x2);                                   \
    const T_VEC even_diff = sub(x0, x2);                                  \
    const T_VEC odd_sum = add(x1, x3);                                    \
    const T_VEC odd_diff = sub(x1, x3);                                   \
    store(output + 0 * stride, add(even_sum, odd_sum)); /* r_0 */         \
    store(output + 1 * stride, even_diff);              /* r_1 */         \
    store(output + 2 * stride, sub(even_sum, odd_sum)); /* r_2 */         \
    store(output + 3 * stride, sub(kZero, odd_diff));   /* i_1 */         \
  }
// Generates aom_fft1d_16_<suffix>: a fully unrolled 16-point forward FFT of
// real input.
//
// The generated function reads 16 values at input + k * stride and stores 16
// values at output + k * stride (k = 0..15). Following the packing described
// for aom_fft_1d_func_t, outputs 0..8 are the real parts r_0..r_8 and outputs
// 9..15 are the imaginary parts i_1..i_7.
//
//   ret       return type of the generated function
//   T, T_VEC  element type of the buffers / type used for loaded values
//   load, store, constant, add, sub, mul
//             operations used to read, write, splat a constant and combine
//             T_VEC values
//
// Constants: kWeight2 ~= cos(pi/4), kWeight3 ~= cos(pi/8),
// kWeight4 ~= sin(pi/8); kWeight0 is zero and is used to negate via sub().
#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                   mul)                                                    \
  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                 \
    const T_VEC kWeight2 = constant(0.707107f);                            \
    const T_VEC kWeight3 = constant(0.92388f);                             \
    const T_VEC kWeight4 = constant(0.382683f);                            \
    const T_VEC i0 = load(input + 0 * stride);                             \
    const T_VEC i1 = load(input + 1 * stride);                             \
    const T_VEC i2 = load(input + 2 * stride);                             \
    const T_VEC i3 = load(input + 3 * stride);                             \
    const T_VEC i4 = load(input + 4 * stride);                             \
    const T_VEC i5 = load(input + 5 * stride);                             \
    const T_VEC i6 = load(input + 6 * stride);                             \
    const T_VEC i7 = load(input + 7 * stride);                             \
    const T_VEC i8 = load(input + 8 * stride);                             \
    const T_VEC i9 = load(input + 9 * stride);                             \
    const T_VEC i10 = load(input + 10 * stride);                           \
    const T_VEC i11 = load(input + 11 * stride);                           \
    const T_VEC i12 = load(input + 12 * stride);                           \
    const T_VEC i13 = load(input + 13 * stride);                           \
    const T_VEC i14 = load(input + 14 * stride);                           \
    const T_VEC i15 = load(input + 15 * stride);                           \
    /* Butterflies over the even-indexed inputs (i0, i2, ..., i14). */     \
    const T_VEC w0 = add(i0, i8);                                          \
    const T_VEC w1 = sub(i0, i8);                                          \
    const T_VEC w2 = add(i4, i12);                                         \
    const T_VEC w3 = sub(i4, i12);                                         \
    const T_VEC w4 = add(w0, w2);                                          \
    const T_VEC w5 = sub(w0, w2);                                          \
    const T_VEC w7 = add(i2, i10);                                         \
    const T_VEC w8 = sub(i2, i10);                                         \
    const T_VEC w9 = add(i6, i14);                                         \
    const T_VEC w10 = sub(i6, i14);                                        \
    const T_VEC w11 = add(w7, w9);                                         \
    const T_VEC w12 = sub(w7, w9);                                         \
    const T_VEC w14 = add(w4, w11);                                        \
    const T_VEC w15 = sub(w4, w11);                                        \
    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
                           sub(sub(kWeight0, w3),                          \
                               mul(kWeight2, add(w10, w8))) };             \
    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
                           sub(w3, mul(kWeight2, add(w10, w8))) };         \
    /* The same butterflies over the odd-indexed inputs (i1, ..., i15). */ \
    const T_VEC w19 = add(i1, i9);                                         \
    const T_VEC w20 = sub(i1, i9);                                         \
    const T_VEC w21 = add(i5, i13);                                        \
    const T_VEC w22 = sub(i5, i13);                                        \
    const T_VEC w23 = add(w19, w21);                                       \
    const T_VEC w24 = sub(w19, w21);                                       \
    const T_VEC w26 = add(i3, i11);                                        \
    const T_VEC w27 = sub(i3, i11);                                        \
    const T_VEC w28 = add(i7, i15);                                        \
    const T_VEC w29 = sub(i7, i15);                                        \
    const T_VEC w30 = add(w26, w28);                                       \
    const T_VEC w31 = sub(w26, w28);                                       \
    const T_VEC w33 = add(w23, w30);                                       \
    const T_VEC w34 = sub(w23, w30);                                       \
    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
                           sub(sub(kWeight0, w22),                         \
                               mul(kWeight2, add(w29, w27))) };            \
    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
                           sub(w22, mul(kWeight2, add(w29, w27))) };       \
    /* Combine the even and odd halves; outputs 0..8 first, then 9..15. */ \
    store(output + 0 * stride, add(w14, w33));                             \
    store(output + 1 * stride,                                             \
          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
    store(output + 3 * stride,                                             \
          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
    store(output + 4 * stride, w15);                                       \
    store(output + 5 * stride,                                             \
          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
                          mul(kWeight3, w37[1]))));                        \
    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
    store(output + 7 * stride,                                             \
          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
                          mul(kWeight4, w35[1]))));                        \
    store(output + 8 * stride, sub(w14, w33));                             \
    store(output + 9 * stride,                                             \
          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
    store(output + 10 * stride,                                            \
          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
    store(output + 11 * stride,                                            \
          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
    store(output + 12 * stride, sub(kWeight0, w34));                       \
    store(output + 13 * stride,                                            \
          sub(sub(kWeight0, w18[1]),                                       \
              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
    store(output + 15 * stride,                                            \
          sub(sub(kWeight0, w16[1]),                                       \
              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
  }
i20 = load(input + 20 * stride); \ - const T_VEC i21 = load(input + 21 * stride); \ - const T_VEC i22 = load(input + 22 * stride); \ - const T_VEC i23 = load(input + 23 * stride); \ - const T_VEC i24 = load(input + 24 * stride); \ - const T_VEC i25 = load(input + 25 * stride); \ - const T_VEC i26 = load(input + 26 * stride); \ - const T_VEC i27 = load(input + 27 * stride); \ - const T_VEC i28 = load(input + 28 * stride); \ - const T_VEC i29 = load(input + 29 * stride); \ - const T_VEC i30 = load(input + 30 * stride); \ - const T_VEC i31 = load(input + 31 * stride); \ - const T_VEC w0 = add(i0, i16); \ - const T_VEC w1 = sub(i0, i16); \ - const T_VEC w2 = add(i8, i24); \ - const T_VEC w3 = sub(i8, i24); \ - const T_VEC w4 = add(w0, w2); \ - const T_VEC w5 = sub(w0, w2); \ - const T_VEC w7 = add(i4, i20); \ - const T_VEC w8 = sub(i4, i20); \ - const T_VEC w9 = add(i12, i28); \ - const T_VEC w10 = sub(i12, i28); \ - const T_VEC w11 = add(w7, w9); \ - const T_VEC w12 = sub(w7, w9); \ - const T_VEC w14 = add(w4, w11); \ - const T_VEC w15 = sub(w4, w11); \ - const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ - sub(sub(kWeight0, w3), \ - mul(kWeight2, add(w10, w8))) }; \ - const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ - sub(w3, mul(kWeight2, add(w10, w8))) }; \ - const T_VEC w19 = add(i2, i18); \ - const T_VEC w20 = sub(i2, i18); \ - const T_VEC w21 = add(i10, i26); \ - const T_VEC w22 = sub(i10, i26); \ - const T_VEC w23 = add(w19, w21); \ - const T_VEC w24 = sub(w19, w21); \ - const T_VEC w26 = add(i6, i22); \ - const T_VEC w27 = sub(i6, i22); \ - const T_VEC w28 = add(i14, i30); \ - const T_VEC w29 = sub(i14, i30); \ - const T_VEC w30 = add(w26, w28); \ - const T_VEC w31 = sub(w26, w28); \ - const T_VEC w33 = add(w23, w30); \ - const T_VEC w34 = sub(w23, w30); \ - const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ - sub(sub(kWeight0, w22), \ - mul(kWeight2, add(w29, w27))) }; \ - const T_VEC w37[2] = { sub(w20, mul(kWeight2, 
sub(w27, w29))), \ - sub(w22, mul(kWeight2, add(w29, w27))) }; \ - const T_VEC w38 = add(w14, w33); \ - const T_VEC w39 = sub(w14, w33); \ - const T_VEC w40[2] = { \ - add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \ - add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \ - }; \ - const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \ - sub(sub(kWeight0, w12), \ - mul(kWeight2, add(w31, w24))) }; \ - const T_VEC w42[2] = { \ - add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \ - add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \ - }; \ - const T_VEC w44[2] = { \ - add(w18[0], \ - sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \ - sub(sub(kWeight0, w18[1]), \ - sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \ - }; \ - const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \ - sub(w12, mul(kWeight2, add(w31, w24))) }; \ - const T_VEC w46[2] = { \ - add(w16[0], \ - sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \ - sub(sub(kWeight0, w16[1]), \ - sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \ - }; \ - const T_VEC w47 = add(i1, i17); \ - const T_VEC w48 = sub(i1, i17); \ - const T_VEC w49 = add(i9, i25); \ - const T_VEC w50 = sub(i9, i25); \ - const T_VEC w51 = add(w47, w49); \ - const T_VEC w52 = sub(w47, w49); \ - const T_VEC w54 = add(i5, i21); \ - const T_VEC w55 = sub(i5, i21); \ - const T_VEC w56 = add(i13, i29); \ - const T_VEC w57 = sub(i13, i29); \ - const T_VEC w58 = add(w54, w56); \ - const T_VEC w59 = sub(w54, w56); \ - const T_VEC w61 = add(w51, w58); \ - const T_VEC w62 = sub(w51, w58); \ - const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \ - sub(sub(kWeight0, w50), \ - mul(kWeight2, add(w57, w55))) }; \ - const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \ - sub(w50, mul(kWeight2, add(w57, w55))) }; \ - const T_VEC w66 = add(i3, i19); \ - const T_VEC w67 = sub(i3, i19); \ - const T_VEC w68 = add(i11, i27); \ 
- const T_VEC w69 = sub(i11, i27); \ - const T_VEC w70 = add(w66, w68); \ - const T_VEC w71 = sub(w66, w68); \ - const T_VEC w73 = add(i7, i23); \ - const T_VEC w74 = sub(i7, i23); \ - const T_VEC w75 = add(i15, i31); \ - const T_VEC w76 = sub(i15, i31); \ - const T_VEC w77 = add(w73, w75); \ - const T_VEC w78 = sub(w73, w75); \ - const T_VEC w80 = add(w70, w77); \ - const T_VEC w81 = sub(w70, w77); \ - const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \ - sub(sub(kWeight0, w69), \ - mul(kWeight2, add(w76, w74))) }; \ - const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \ - sub(w69, mul(kWeight2, add(w76, w74))) }; \ - const T_VEC w85 = add(w61, w80); \ - const T_VEC w86 = sub(w61, w80); \ - const T_VEC w87[2] = { \ - add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \ - add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \ - }; \ - const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \ - sub(sub(kWeight0, w59), \ - mul(kWeight2, add(w78, w71))) }; \ - const T_VEC w89[2] = { \ - add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \ - add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \ - }; \ - const T_VEC w91[2] = { \ - add(w65[0], \ - sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \ - sub(sub(kWeight0, w65[1]), \ - sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \ - }; \ - const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \ - sub(w59, mul(kWeight2, add(w78, w71))) }; \ - const T_VEC w93[2] = { \ - add(w63[0], \ - sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \ - sub(sub(kWeight0, w63[1]), \ - sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \ - }; \ - store(output + 0 * stride, add(w38, w85)); \ - store(output + 1 * stride, \ - add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \ - store(output + 2 * stride, \ - add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \ - store(output + 3 * stride, \ 
- add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \ - store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \ - store(output + 5 * stride, \ - add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \ - store(output + 6 * stride, \ - add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \ - store(output + 7 * stride, \ - add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \ - store(output + 8 * stride, w39); \ - store(output + 9 * stride, \ - add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \ - mul(kWeight5, w93[1])))); \ - store(output + 10 * stride, \ - add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \ - mul(kWeight3, w92[1])))); \ - store(output + 11 * stride, \ - add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \ - mul(kWeight7, w91[1])))); \ - store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \ - store(output + 13 * stride, \ - add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \ - mul(kWeight8, w89[1])))); \ - store(output + 14 * stride, \ - add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \ - mul(kWeight4, w88[1])))); \ - store(output + 15 * stride, \ - add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \ - mul(kWeight6, w87[1])))); \ - store(output + 16 * stride, sub(w38, w85)); \ - store(output + 17 * stride, \ - add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \ - store(output + 18 * stride, \ - add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \ - store(output + 19 * stride, \ - add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \ - store(output + 20 * stride, \ - sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \ - store(output + 21 * stride, \ - add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \ - store(output + 22 * stride, \ - add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \ - store(output + 23 * stride, \ - add(w46[1], sub(mul(kWeight6, w93[1]), 
// Generates aom_ifft1d_4_<suffix>: an unrolled 4-point inverse FFT producing
// real output. The function reads four values at input + k * stride, packed
// as [r_0, r_1, r_2, i_1], and stores four real values at
// output + k * stride. No 1/n scaling is applied.
//
//   ret       return type of the generated function
//   T, T_VEC  element type of the buffers / type used for loaded values
//   load, store, constant, add, sub
//             operations used to read, write, splat a constant and combine
//             T_VEC values
//
// Only the two combined terms that reach an output are formed: 2 * r_1 (w4)
// and -2 * i_1 (w5).
#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                 \
    const T_VEC i0 = load(input + 0 * stride);                             \
    const T_VEC i1 = load(input + 1 * stride);                             \
    const T_VEC i2 = load(input + 2 * stride);                             \
    const T_VEC i3 = load(input + 3 * stride);                             \
    const T_VEC w2 = add(i0, i2);                                          \
    const T_VEC w3 = sub(i0, i2);                                          \
    const T_VEC w4 = add(i1, i1);                   /* 2 * r_1 */          \
    const T_VEC w5 = sub(sub(kWeight0, i3), i3);    /* -2 * i_1 */         \
    store(output + 0 * stride, add(w2, w4));                               \
    store(output + 1 * stride, add(w3, w5));                               \
    store(output + 2 * stride, sub(w2, w4));                               \
    store(output + 3 * stride, sub(w3, w5));                               \
  }
T_VEC, load, store, constant, add, sub, \ - mul) \ - ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \ - const T_VEC kWeight0 = constant(0.0f); \ - const T_VEC kWeight2 = constant(0.707107f); \ - const T_VEC i0 = load(input + 0 * stride); \ - const T_VEC i1 = load(input + 1 * stride); \ - const T_VEC i2 = load(input + 2 * stride); \ - const T_VEC i3 = load(input + 3 * stride); \ - const T_VEC i4 = load(input + 4 * stride); \ - const T_VEC i5 = load(input + 5 * stride); \ - const T_VEC i6 = load(input + 6 * stride); \ - const T_VEC i7 = load(input + 7 * stride); \ - const T_VEC w6 = add(i0, i4); \ - const T_VEC w7 = sub(i0, i4); \ - const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \ - const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \ - const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \ - const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \ - const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \ - const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \ - const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \ - const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \ - const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \ - const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \ - const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \ - const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \ - const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \ - const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \ - store(output + 0 * stride, add(w10[0], w18[0])); \ - store(output + 1 * stride, \ - add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \ - store(output + 2 * stride, add(w11[0], w19[1])); \ - store(output + 3 * stride, \ - sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ - store(output + 4 * stride, sub(w10[0], w18[0])); \ - store(output + 5 * stride, \ - add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \ - mul(kWeight2, w20[1])))); \ 
- store(output + 6 * stride, sub(w11[0], w19[1])); \ - store(output + 7 * stride, \ - add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ - } - -#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ - mul) \ - ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \ - const T_VEC kWeight0 = constant(0.0f); \ - const T_VEC kWeight2 = constant(0.707107f); \ - const T_VEC kWeight3 = constant(0.92388f); \ - const T_VEC kWeight4 = constant(0.382683f); \ - const T_VEC i0 = load(input + 0 * stride); \ - const T_VEC i1 = load(input + 1 * stride); \ - const T_VEC i2 = load(input + 2 * stride); \ - const T_VEC i3 = load(input + 3 * stride); \ - const T_VEC i4 = load(input + 4 * stride); \ - const T_VEC i5 = load(input + 5 * stride); \ - const T_VEC i6 = load(input + 6 * stride); \ - const T_VEC i7 = load(input + 7 * stride); \ - const T_VEC i8 = load(input + 8 * stride); \ - const T_VEC i9 = load(input + 9 * stride); \ - const T_VEC i10 = load(input + 10 * stride); \ - const T_VEC i11 = load(input + 11 * stride); \ - const T_VEC i12 = load(input + 12 * stride); \ - const T_VEC i13 = load(input + 13 * stride); \ - const T_VEC i14 = load(input + 14 * stride); \ - const T_VEC i15 = load(input + 15 * stride); \ - const T_VEC w14 = add(i0, i8); \ - const T_VEC w15 = sub(i0, i8); \ - const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \ - const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \ - const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \ - const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \ - const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \ - const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \ - const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \ - const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \ - const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \ - const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \ - const T_VEC w26[2] = { add(w22[0], 
w24[0]), add(w22[1], w24[1]) }; \ - const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \ - const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \ - const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \ - const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \ - const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \ - const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \ - add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \ - const T_VEC w33[2] = { add(w20[0], \ - sub(sub(kWeight0, mul(kWeight2, w28[0])), \ - mul(kWeight2, w28[1]))), \ - add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \ - const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \ - const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \ - const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ - sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ - const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ - add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ - const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \ - const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \ - const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \ - const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \ - const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ - const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ - const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ - const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ - const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \ - const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \ - const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \ - const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \ - const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \ - const T_VEC w51[2] = { sub(w46[0], 
w48[0]), sub(w46[1], w48[1]) }; \ - const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \ - const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \ - const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \ - const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \ - const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \ - add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \ - const T_VEC w57[2] = { add(w44[0], \ - sub(sub(kWeight0, mul(kWeight2, w52[0])), \ - mul(kWeight2, w52[1]))), \ - add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \ - const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \ - const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \ - const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ - sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ - const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ - add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ - store(output + 0 * stride, add(w30[0], w54[0])); \ - store(output + 1 * stride, \ - add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \ - store(output + 2 * stride, \ - add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \ - store(output + 3 * stride, \ - add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \ - store(output + 4 * stride, add(w31[0], w55[1])); \ - store(output + 5 * stride, \ - sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ - store(output + 6 * stride, \ - sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ - store(output + 7 * stride, \ - sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ - store(output + 8 * stride, sub(w30[0], w54[0])); \ - store(output + 9 * stride, \ - add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \ - mul(kWeight4, w56[1])))); \ - store(output + 10 * stride, \ - add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \ - mul(kWeight2, w58[1])))); \ - 
store(output + 11 * stride, \ - add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \ - mul(kWeight3, w60[1])))); \ - store(output + 12 * stride, sub(w31[0], w55[1])); \ - store(output + 13 * stride, \ - add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ - store(output + 14 * stride, \ - add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ - store(output + 15 * stride, \ - add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ - } -#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ - mul) \ - ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \ - const T_VEC kWeight0 = constant(0.0f); \ - const T_VEC kWeight2 = constant(0.707107f); \ - const T_VEC kWeight3 = constant(0.92388f); \ - const T_VEC kWeight4 = constant(0.382683f); \ - const T_VEC kWeight5 = constant(0.980785f); \ - const T_VEC kWeight6 = constant(0.19509f); \ - const T_VEC kWeight7 = constant(0.83147f); \ - const T_VEC kWeight8 = constant(0.55557f); \ - const T_VEC i0 = load(input + 0 * stride); \ - const T_VEC i1 = load(input + 1 * stride); \ - const T_VEC i2 = load(input + 2 * stride); \ - const T_VEC i3 = load(input + 3 * stride); \ - const T_VEC i4 = load(input + 4 * stride); \ - const T_VEC i5 = load(input + 5 * stride); \ - const T_VEC i6 = load(input + 6 * stride); \ - const T_VEC i7 = load(input + 7 * stride); \ - const T_VEC i8 = load(input + 8 * stride); \ - const T_VEC i9 = load(input + 9 * stride); \ - const T_VEC i10 = load(input + 10 * stride); \ - const T_VEC i11 = load(input + 11 * stride); \ - const T_VEC i12 = load(input + 12 * stride); \ - const T_VEC i13 = load(input + 13 * stride); \ - const T_VEC i14 = load(input + 14 * stride); \ - const T_VEC i15 = load(input + 15 * stride); \ - const T_VEC i16 = load(input + 16 * stride); \ - const T_VEC i17 = load(input + 17 * stride); \ - const T_VEC i18 = load(input + 18 * stride); \ - const T_VEC i19 = load(input + 19 * stride); \ - const T_VEC i20 = load(input + 20 * 
stride); \ - const T_VEC i21 = load(input + 21 * stride); \ - const T_VEC i22 = load(input + 22 * stride); \ - const T_VEC i23 = load(input + 23 * stride); \ - const T_VEC i24 = load(input + 24 * stride); \ - const T_VEC i25 = load(input + 25 * stride); \ - const T_VEC i26 = load(input + 26 * stride); \ - const T_VEC i27 = load(input + 27 * stride); \ - const T_VEC i28 = load(input + 28 * stride); \ - const T_VEC i29 = load(input + 29 * stride); \ - const T_VEC i30 = load(input + 30 * stride); \ - const T_VEC i31 = load(input + 31 * stride); \ - const T_VEC w30 = add(i0, i16); \ - const T_VEC w31 = sub(i0, i16); \ - const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \ - const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \ - const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \ - const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \ - const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \ - const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \ - const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \ - const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \ - const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \ - const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \ - const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ - const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ - const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ - const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ - const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \ - const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \ - const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \ - add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \ - const T_VEC w49[2] = { add(w36[0], \ - sub(sub(kWeight0, mul(kWeight2, w44[0])), \ - mul(kWeight2, w44[1]))), \ - add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \ - const 
T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \ - const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \ - const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ - sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ - const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ - add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ - const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \ - const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \ - const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \ - const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \ - const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \ - const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \ - const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \ - const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \ - const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \ - const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \ - const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \ - const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \ - const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \ - const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \ - const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \ - const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \ - const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \ - const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \ - const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \ - add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \ - const T_VEC w73[2] = { add(w60[0], \ - sub(sub(kWeight0, mul(kWeight2, w68[0])), \ - mul(kWeight2, w68[1]))), \ - add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \ - const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; 
\ - const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \ - const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ - sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ - const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ - add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ - const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \ - const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \ - const T_VEC w80[2] = { \ - add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \ - add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \ - }; \ - const T_VEC w81[2] = { \ - add(w48[0], \ - sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \ - add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \ - }; \ - const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \ - add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \ - const T_VEC w83[2] = { add(w50[0], \ - sub(sub(kWeight0, mul(kWeight2, w74[0])), \ - mul(kWeight2, w74[1]))), \ - add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \ - const T_VEC w84[2] = { \ - add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \ - add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \ - }; \ - const T_VEC w85[2] = { \ - add(w52[0], \ - sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \ - add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \ - }; \ - const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \ - const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \ - const T_VEC w88[2] = { \ - sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ - add(w49[1], \ - sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \ - }; \ - const T_VEC w89[2] = { \ - add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ - add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \ - }; \ - const 
T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ - sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ - const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ - add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ - const T_VEC w92[2] = { \ - sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ - add(w53[1], \ - sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \ - }; \ - const T_VEC w93[2] = { \ - add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ - add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \ - }; \ - const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \ - const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \ - const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \ - const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \ - const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \ - const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \ - const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \ - const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \ - const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \ - const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \ - const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \ - const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \ - const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \ - const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \ - const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \ - const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \ - const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \ - const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \ - const T_VEC w112[2] = { \ - add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \ - add(w100[1], mul(kWeight2, 
sub(w108[1], w108[0]))) \ - }; \ - const T_VEC w113[2] = { \ - add(w100[0], \ - sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \ - add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \ - }; \ - const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \ - const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \ - const T_VEC w116[2] = { \ - sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ - sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ - }; \ - const T_VEC w117[2] = { \ - add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ - add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ - }; \ - const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \ - const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \ - const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \ - const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \ - const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \ - const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \ - const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \ - const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \ - const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \ - const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \ - const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \ - const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \ - const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \ - const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \ - const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \ - const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \ - const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \ - const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \ - const T_VEC w136[2] = { \ - add(w124[0], mul(kWeight2, 
add(w132[0], w132[1]))), \ - add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \ - }; \ - const T_VEC w137[2] = { \ - add(w124[0], \ - sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \ - add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \ - }; \ - const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \ - const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \ - const T_VEC w140[2] = { \ - sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ - sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ - }; \ - const T_VEC w141[2] = { \ - add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ - add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ - }; \ - const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \ - const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \ - const T_VEC w144[2] = { \ - add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \ - add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \ - }; \ - const T_VEC w145[2] = { \ - add(w112[0], \ - sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \ - add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \ - }; \ - const T_VEC w146[2] = { \ - add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \ - add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \ - }; \ - const T_VEC w147[2] = { \ - add(w114[0], \ - sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \ - add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \ - }; \ - const T_VEC w148[2] = { \ - add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \ - add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \ - }; \ - const T_VEC w149[2] = { \ - add(w116[0], \ - sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \ - add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \ - }; \ - const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], 
w135[0]) }; \ - const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \ - const T_VEC w152[2] = { \ - sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ - add(w113[1], \ - sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \ - }; \ - const T_VEC w153[2] = { \ - add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ - add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \ - }; \ - const T_VEC w154[2] = { \ - sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ - sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ - }; \ - const T_VEC w155[2] = { \ - add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ - add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ - }; \ - const T_VEC w156[2] = { \ - sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ - add(w117[1], \ - sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \ - }; \ - const T_VEC w157[2] = { \ - add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ - add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \ - }; \ - store(output + 0 * stride, add(w78[0], w142[0])); \ - store(output + 1 * stride, \ - add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \ - store(output + 2 * stride, \ - add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \ - store(output + 3 * stride, \ - add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \ - store(output + 4 * stride, \ - add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \ - store(output + 5 * stride, \ - add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \ - store(output + 6 * stride, \ - add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \ - store(output + 7 * stride, \ - add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \ - store(output + 8 * stride, add(w79[0], w143[1])); \ - store(output + 9 * stride, \ - sub(w81[0], sub(mul(kWeight6, 
w145[0]), mul(kWeight5, w145[1])))); \ - store(output + 10 * stride, \ - sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ - store(output + 11 * stride, \ - sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ - store(output + 12 * stride, \ - sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ - store(output + 13 * stride, \ - sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ - store(output + 14 * stride, \ - sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ - store(output + 15 * stride, \ - sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ - store(output + 16 * stride, sub(w78[0], w142[0])); \ - store(output + 17 * stride, \ - add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \ - mul(kWeight6, w144[1])))); \ - store(output + 18 * stride, \ - add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \ - mul(kWeight4, w146[1])))); \ - store(output + 19 * stride, \ - add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \ - mul(kWeight8, w148[1])))); \ - store(output + 20 * stride, \ - add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \ - mul(kWeight2, w150[1])))); \ - store(output + 21 * stride, \ - add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \ - mul(kWeight7, w152[1])))); \ - store(output + 22 * stride, \ - add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \ - mul(kWeight3, w154[1])))); \ - store(output + 23 * stride, \ - add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \ - mul(kWeight5, w156[1])))); \ - store(output + 24 * stride, sub(w79[0], w143[1])); \ - store(output + 25 * stride, \ - add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ - store(output + 26 * stride, \ - add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ - store(output + 27 * stride, \ - add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ - store(output + 28 * stride, \ - add(w87[0], mul(kWeight2, sub(w151[0], 
w151[1])))); \ - store(output + 29 * stride, \ - add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ - store(output + 30 * stride, \ - add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ - store(output + 31 * stride, \ - add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ - } - -#endif // AOM_AOM_DSP_FFT_COMMON_H_ diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c deleted file mode 100644 index e50f951c1..000000000 --- a/third_party/aom/aom_dsp/fwd_txfm.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include "aom_dsp/txfm_common.h" -#include "config/aom_dsp_rtcd.h" - -void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { - int i, j; - tran_low_t intermediate[64]; - int pass; - tran_low_t *output = intermediate; - const tran_low_t *in = NULL; - - // Transform columns - for (pass = 0; pass < 2; ++pass) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 - tran_high_t t0, t1, t2, t3; // needs32 - tran_high_t x0, x1, x2, x3; // canbe16 - - for (i = 0; i < 8; i++) { - // stage 1 - if (pass == 0) { - s0 = (input[0 * stride] + input[7 * stride]) * 4; - s1 = (input[1 * stride] + input[6 * stride]) * 4; - s2 = (input[2 * stride] + input[5 * stride]) * 4; - s3 = (input[3 * stride] + input[4 * stride]) * 4; - s4 = (input[3 * stride] - input[4 * stride]) * 4; - s5 = (input[2 * stride] - input[5 * stride]) * 4; - s6 = (input[1 * stride] - input[6 * stride]) * 4; - s7 = (input[0 * stride] - input[7 * stride]) * 4; - ++input; - } else { - s0 = in[0 * 8] + in[7 * 8]; - s1 = in[1 * 8] + in[6 * 8]; - s2 = in[2 * 8] + in[5 * 8]; - s3 = in[3 * 8] + in[4 * 8]; - s4 = in[3 * 8] - in[4 * 8]; - s5 = in[2 * 8] - in[5 * 8]; - s6 = in[1 * 8] - in[6 * 8]; - s7 = in[0 * 8] - in[7 * 8]; - ++in; - } - - // fdct4(step, step); - x0 = s0 + s3; - x1 = s1 + s2; - x2 = s1 - s2; - x3 = s0 - s3; - t0 = (x0 + x1) * cospi_16_64; - t1 = (x0 - x1) * cospi_16_64; - t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0] = (tran_low_t)fdct_round_shift(t0); - output[2] = (tran_low_t)fdct_round_shift(t2); - output[4] = (tran_low_t)fdct_round_shift(t1); - output[6] = (tran_low_t)fdct_round_shift(t3); - - // Stage 2 - t0 = (s6 - s5) * cospi_16_64; - t1 = (s6 + s5) * cospi_16_64; - t2 = fdct_round_shift(t0); - t3 = fdct_round_shift(t1); - - // Stage 3 - x0 = s4 + t2; - x1 = s4 - t2; - x2 = s7 - t3; - x3 = s7 + t3; - - // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * 
cospi_20_64; - t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1] = (tran_low_t)fdct_round_shift(t0); - output[3] = (tran_low_t)fdct_round_shift(t2); - output[5] = (tran_low_t)fdct_round_shift(t1); - output[7] = (tran_low_t)fdct_round_shift(t3); - output += 8; - } - in = intermediate; - output = final_output; - } - - // Rows - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; - } -} - -void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, - int stride) { - aom_fdct8x8_c(input, final_output, stride); -} diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c deleted file mode 100644 index b96e1c319..000000000 --- a/third_party/aom/aom_dsp/grain_synthesis.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/*!\file - * \brief Describes film grain parameters and film grain synthesis - * - */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#include <assert.h> -#include "aom_dsp/grain_synthesis.h" -#include "aom_mem/aom_mem.h" - -// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits) -// with zero mean and standard deviation of about 512. -// should be divided by 4 for 10-bit range and 16 for 8-bit range. 
-static const int gaussian_sequence[2048] = { - 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, - 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, - 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, - -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, - 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, - 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, - 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, - 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, - 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, - 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, - 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, - -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, - 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, - 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, - -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, - -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, - -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, - -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, - 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, - 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, - 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, - -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, - -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, - -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, - 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, - 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, - 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, - -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, - 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, - -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, - 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, - -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, 
- 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, - -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, - -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, - -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, - -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, - -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, - 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, - 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, - -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, - -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, - 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, - 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, - -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, - 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, - 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, - -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, - 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, - -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, - 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, - -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, - -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, - 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, - -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, - -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, - 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, - 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, - -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, - 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, - 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, - 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, - -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, - -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, - -196, 388, 304, 500, 724, -160, 
244, -84, 272, -256, -420, - 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, - -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, - -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, - -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, - -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, - -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, - 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, - -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, - -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, - 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, - -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, - -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, - -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, - 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, - -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, - 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, - 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, - 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, - -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, - -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, - 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, - 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, - -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, - -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, - -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, - -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, - 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, - 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, - 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, - 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, - 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, - 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, - 436, 896, 88, 
-392, 132, 80, -964, -288, 568, 56, -48, - -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, - 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, - -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, - -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, - -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, - 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, - -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, - -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, - 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, - 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, - 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, - 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, - 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, - 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, - 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, - -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, - -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, - -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, - 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, - -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, - -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, - 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, - -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, - 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, - 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, - 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, - -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, - 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, - -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, - 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, - 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, - 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, - 600, 424, 888, 656, 
-356, -320, 220, 316, -176, -724, -188, - -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, - -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, - 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, - -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, - 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, - 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, - 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, - -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, - -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, - 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, - 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, - 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, - -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, - -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, - 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, - -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, - -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, - -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, - 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, - -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, - 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, - -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, - 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, - -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, - 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, - 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, - 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, - 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, - -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, - -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, - -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, - -448, -472, 856, 
-556, -364, 572, -12, -156, -368, -340, 432, - 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, - 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, - 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, - 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, - -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, - 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, - -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, - 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, - 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, - -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, - -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, - -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, - -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, - 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, - -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, - -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, - -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, - -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, - 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, - 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, - -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, - -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, - 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, - 428, -484 -}; - -static const int gauss_bits = 11; - -static int luma_subblock_size_y = 32; -static int luma_subblock_size_x = 32; - -static int chroma_subblock_size_y = 16; -static int chroma_subblock_size_x = 16; - -static const int min_luma_legal_range = 16; -static const int max_luma_legal_range = 235; - -static const int min_chroma_legal_range = 16; -static const int max_chroma_legal_range = 240; - -static int scaling_lut_y[256]; -static int scaling_lut_cb[256]; -static int scaling_lut_cr[256]; - -static int grain_center; -static 
int grain_min; -static int grain_max; - -static uint16_t random_register = 0; // random number generator register - -static void init_arrays(const aom_film_grain_t *params, int luma_stride, - int chroma_stride, int ***pred_pos_luma_p, - int ***pred_pos_chroma_p, int **luma_grain_block, - int **cb_grain_block, int **cr_grain_block, - int **y_line_buf, int **cb_line_buf, int **cr_line_buf, - int **y_col_buf, int **cb_col_buf, int **cr_col_buf, - int luma_grain_samples, int chroma_grain_samples, - int chroma_subsamp_y, int chroma_subsamp_x) { - memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256); - memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256); - memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256); - - int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); - int num_pos_chroma = num_pos_luma; - if (params->num_y_points > 0) ++num_pos_chroma; - - int **pred_pos_luma; - int **pred_pos_chroma; - - pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma); - - for (int row = 0; row < num_pos_luma; row++) { - pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3); - } - - pred_pos_chroma = - (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma); - - for (int row = 0; row < num_pos_chroma; row++) { - pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3); - } - - int pos_ar_index = 0; - - for (int row = -params->ar_coeff_lag; row < 0; row++) { - for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1; - col++) { - pred_pos_luma[pos_ar_index][0] = row; - pred_pos_luma[pos_ar_index][1] = col; - pred_pos_luma[pos_ar_index][2] = 0; - - pred_pos_chroma[pos_ar_index][0] = row; - pred_pos_chroma[pos_ar_index][1] = col; - pred_pos_chroma[pos_ar_index][2] = 0; - ++pos_ar_index; - } - } - - for (int col = -params->ar_coeff_lag; col < 0; col++) { - pred_pos_luma[pos_ar_index][0] = 0; - pred_pos_luma[pos_ar_index][1] = col; - pred_pos_luma[pos_ar_index][2] = 0; - - 
pred_pos_chroma[pos_ar_index][0] = 0; - pred_pos_chroma[pos_ar_index][1] = col; - pred_pos_chroma[pos_ar_index][2] = 0; - - ++pos_ar_index; - } - - if (params->num_y_points > 0) { - pred_pos_chroma[pos_ar_index][0] = 0; - pred_pos_chroma[pos_ar_index][1] = 0; - pred_pos_chroma[pos_ar_index][2] = 1; - } - - *pred_pos_luma_p = pred_pos_luma; - *pred_pos_chroma_p = pred_pos_chroma; - - *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2); - *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride * - (2 >> chroma_subsamp_y)); - *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride * - (2 >> chroma_subsamp_y)); - - *y_col_buf = - (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2); - *cb_col_buf = - (int *)aom_malloc(sizeof(**cb_col_buf) * - (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * - (2 >> chroma_subsamp_x)); - *cr_col_buf = - (int *)aom_malloc(sizeof(**cr_col_buf) * - (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * - (2 >> chroma_subsamp_x)); - - *luma_grain_block = - (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples); - *cb_grain_block = - (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples); - *cr_grain_block = - (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); -} - -static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma, - int ***pred_pos_chroma, int **luma_grain_block, - int **cb_grain_block, int **cr_grain_block, - int **y_line_buf, int **cb_line_buf, - int **cr_line_buf, int **y_col_buf, int **cb_col_buf, - int **cr_col_buf) { - int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); - int num_pos_chroma = num_pos_luma; - if (params->num_y_points > 0) ++num_pos_chroma; - - for (int row = 0; row < num_pos_luma; row++) { - aom_free((*pred_pos_luma)[row]); - } - aom_free(*pred_pos_luma); - - for (int row = 0; row < num_pos_chroma; row++) { - aom_free((*pred_pos_chroma)[row]); - } - 
aom_free((*pred_pos_chroma)); - - aom_free(*y_line_buf); - - aom_free(*cb_line_buf); - - aom_free(*cr_line_buf); - - aom_free(*y_col_buf); - - aom_free(*cb_col_buf); - - aom_free(*cr_col_buf); - - aom_free(*luma_grain_block); - - aom_free(*cb_grain_block); - - aom_free(*cr_grain_block); -} - -// get a number between 0 and 2^bits - 1 -static INLINE int get_random_number(int bits) { - uint16_t bit; - bit = ((random_register >> 0) ^ (random_register >> 1) ^ - (random_register >> 3) ^ (random_register >> 12)) & - 1; - random_register = (random_register >> 1) | (bit << 15); - return (random_register >> (16 - bits)) & ((1 << bits) - 1); -} - -static void init_random_generator(int luma_line, uint16_t seed) { - // same for the picture - - uint16_t msb = (seed >> 8) & 255; - uint16_t lsb = seed & 255; - - random_register = (msb << 8) + lsb; - - // changes for each row - int luma_num = luma_line >> 5; - - random_register ^= ((luma_num * 37 + 178) & 255) << 8; - random_register ^= ((luma_num * 173 + 105) & 255); -} - -// Return 0 for success, -1 for failure -static int generate_luma_grain_block( - const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, - int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, - int left_pad, int top_pad, int right_pad, int bottom_pad) { - if (params->num_y_points == 0) { - memset(luma_grain_block, 0, - sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); - return 0; - } - - int bit_depth = params->bit_depth; - int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; - - int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); - int rounding_offset = (1 << (params->ar_coeff_shift - 1)); - - for (int i = 0; i < luma_block_size_y; i++) - for (int j = 0; j < luma_block_size_x; j++) - luma_grain_block[i * luma_grain_stride + j] = - (gaussian_sequence[get_random_number(gauss_bits)] + - ((1 << gauss_sec_shift) >> 1)) >> - gauss_sec_shift; - - for (int i = top_pad; i < 
luma_block_size_y - bottom_pad; i++) - for (int j = left_pad; j < luma_block_size_x - right_pad; j++) { - int wsum = 0; - for (int pos = 0; pos < num_pos_luma; pos++) { - wsum = wsum + params->ar_coeffs_y[pos] * - luma_grain_block[(i + pred_pos_luma[pos][0]) * - luma_grain_stride + - j + pred_pos_luma[pos][1]]; - } - luma_grain_block[i * luma_grain_stride + j] = - clamp(luma_grain_block[i * luma_grain_stride + j] + - ((wsum + rounding_offset) >> params->ar_coeff_shift), - grain_min, grain_max); - } - return 0; -} - -// Return 0 for success, -1 for failure -static int generate_chroma_grain_blocks( - const aom_film_grain_t *params, - // int** pred_pos_luma, - int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, - int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y, - int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad, - int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) { - int bit_depth = params->bit_depth; - int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; - - int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); - if (params->num_y_points > 0) ++num_pos_chroma; - int rounding_offset = (1 << (params->ar_coeff_shift - 1)); - int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride; - - if (params->num_cb_points || params->chroma_scaling_from_luma) { - init_random_generator(7 << 5, params->random_seed); - - for (int i = 0; i < chroma_block_size_y; i++) - for (int j = 0; j < chroma_block_size_x; j++) - cb_grain_block[i * chroma_grain_stride + j] = - (gaussian_sequence[get_random_number(gauss_bits)] + - ((1 << gauss_sec_shift) >> 1)) >> - gauss_sec_shift; - } else { - memset(cb_grain_block, 0, - sizeof(*cb_grain_block) * chroma_grain_block_size); - } - - if (params->num_cr_points || params->chroma_scaling_from_luma) { - init_random_generator(11 << 5, params->random_seed); - - for (int i = 0; i < chroma_block_size_y; i++) - for (int j = 0; 
j < chroma_block_size_x; j++) - cr_grain_block[i * chroma_grain_stride + j] = - (gaussian_sequence[get_random_number(gauss_bits)] + - ((1 << gauss_sec_shift) >> 1)) >> - gauss_sec_shift; - } else { - memset(cr_grain_block, 0, - sizeof(*cr_grain_block) * chroma_grain_block_size); - } - - for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) - for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) { - int wsum_cb = 0; - int wsum_cr = 0; - for (int pos = 0; pos < num_pos_chroma; pos++) { - if (pred_pos_chroma[pos][2] == 0) { - wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * - cb_grain_block[(i + pred_pos_chroma[pos][0]) * - chroma_grain_stride + - j + pred_pos_chroma[pos][1]]; - wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * - cr_grain_block[(i + pred_pos_chroma[pos][0]) * - chroma_grain_stride + - j + pred_pos_chroma[pos][1]]; - } else if (pred_pos_chroma[pos][2] == 1) { - int av_luma = 0; - int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad; - int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad; - - for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1; - k++) - for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1; - l++) - av_luma += luma_grain_block[k * luma_grain_stride + l]; - - av_luma = - (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >> - (chroma_subsamp_y + chroma_subsamp_x); - - wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma; - wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma; - } else { - fprintf( - stderr, - "Grain synthesis: prediction between two chroma components is " - "not supported!"); - return -1; - } - } - if (params->num_cb_points || params->chroma_scaling_from_luma) - cb_grain_block[i * chroma_grain_stride + j] = - clamp(cb_grain_block[i * chroma_grain_stride + j] + - ((wsum_cb + rounding_offset) >> params->ar_coeff_shift), - grain_min, grain_max); - if (params->num_cr_points || params->chroma_scaling_from_luma) - cr_grain_block[i * 
chroma_grain_stride + j] = - clamp(cr_grain_block[i * chroma_grain_stride + j] + - ((wsum_cr + rounding_offset) >> params->ar_coeff_shift), - grain_min, grain_max); - } - return 0; -} - -static void init_scaling_function(const int scaling_points[][2], int num_points, - int scaling_lut[]) { - if (num_points == 0) return; - - for (int i = 0; i < scaling_points[0][0]; i++) - scaling_lut[i] = scaling_points[0][1]; - - for (int point = 0; point < num_points - 1; point++) { - int delta_y = scaling_points[point + 1][1] - scaling_points[point][1]; - int delta_x = scaling_points[point + 1][0] - scaling_points[point][0]; - - int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x); - - for (int x = 0; x < delta_x; x++) { - scaling_lut[scaling_points[point][0] + x] = - scaling_points[point][1] + (int)((x * delta + 32768) >> 16); - } - } - - for (int i = scaling_points[num_points - 1][0]; i < 256; i++) - scaling_lut[i] = scaling_points[num_points - 1][1]; -} - -// function that extracts samples from a LUT (and interpolates intemediate -// frames for 10- and 12-bit video) -static int scale_LUT(int *scaling_lut, int index, int bit_depth) { - int x = index >> (bit_depth - 8); - - if (!(bit_depth - 8) || x == 255) - return scaling_lut[x]; - else - return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) * - (index & ((1 << (bit_depth - 8)) - 1)) + - (1 << (bit_depth - 9))) >> - (bit_depth - 8)); -} - -static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma, - uint8_t *cb, uint8_t *cr, int luma_stride, - int chroma_stride, int *luma_grain, - int *cb_grain, int *cr_grain, - int luma_grain_stride, int chroma_grain_stride, - int half_luma_height, int half_luma_width, - int bit_depth, int chroma_subsamp_y, - int chroma_subsamp_x, int mc_identity) { - int cb_mult = params->cb_mult - 128; // fixed scale - int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale - int cb_offset = params->cb_offset - 256; - - int cr_mult = params->cr_mult - 128; // 
fixed scale - int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale - int cr_offset = params->cr_offset - 256; - - int rounding_offset = (1 << (params->scaling_shift - 1)); - - int apply_y = params->num_y_points > 0 ? 1 : 0; - int apply_cb = - (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0; - int apply_cr = - (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0; - - if (params->chroma_scaling_from_luma) { - cb_mult = 0; // fixed scale - cb_luma_mult = 64; // fixed scale - cb_offset = 0; - - cr_mult = 0; // fixed scale - cr_luma_mult = 64; // fixed scale - cr_offset = 0; - } - - int min_luma, max_luma, min_chroma, max_chroma; - - if (params->clip_to_restricted_range) { - min_luma = min_luma_legal_range; - max_luma = max_luma_legal_range; - - if (mc_identity) { - min_chroma = min_luma_legal_range; - max_chroma = max_luma_legal_range; - } else { - min_chroma = min_chroma_legal_range; - max_chroma = max_chroma_legal_range; - } - } else { - min_luma = min_chroma = 0; - max_luma = max_chroma = 255; - } - - for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { - for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { - int average_luma = 0; - if (chroma_subsamp_x) { - average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + - (j << chroma_subsamp_x)] + - luma[(i << chroma_subsamp_y) * luma_stride + - (j << chroma_subsamp_x) + 1] + - 1) >> - 1; - } else { - average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; - } - - if (apply_cb) { - cb[i * chroma_stride + j] = clamp( - cb[i * chroma_stride + j] + - ((scale_LUT(scaling_lut_cb, - clamp(((average_luma * cb_luma_mult + - cb_mult * cb[i * chroma_stride + j]) >> - 6) + - cb_offset, - 0, (256 << (bit_depth - 8)) - 1), - 8) * - cb_grain[i * chroma_grain_stride + j] + - rounding_offset) >> - params->scaling_shift), - min_chroma, max_chroma); - } - - if (apply_cr) { - cr[i * chroma_stride + j] = clamp( - cr[i * chroma_stride + j] 
+ - ((scale_LUT(scaling_lut_cr, - clamp(((average_luma * cr_luma_mult + - cr_mult * cr[i * chroma_stride + j]) >> - 6) + - cr_offset, - 0, (256 << (bit_depth - 8)) - 1), - 8) * - cr_grain[i * chroma_grain_stride + j] + - rounding_offset) >> - params->scaling_shift), - min_chroma, max_chroma); - } - } - } - - if (apply_y) { - for (int i = 0; i < (half_luma_height << 1); i++) { - for (int j = 0; j < (half_luma_width << 1); j++) { - luma[i * luma_stride + j] = - clamp(luma[i * luma_stride + j] + - ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) * - luma_grain[i * luma_grain_stride + j] + - rounding_offset) >> - params->scaling_shift), - min_luma, max_luma); - } - } - } -} - -static void add_noise_to_block_hbd( - const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, - int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, - int *cr_grain, int luma_grain_stride, int chroma_grain_stride, - int half_luma_height, int half_luma_width, int bit_depth, - int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { - int cb_mult = params->cb_mult - 128; // fixed scale - int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale - // offset value depends on the bit depth - int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth); - - int cr_mult = params->cr_mult - 128; // fixed scale - int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale - // offset value depends on the bit depth - int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth); - - int rounding_offset = (1 << (params->scaling_shift - 1)); - - int apply_y = params->num_y_points > 0 ? 1 : 0; - int apply_cb = - (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 - : 0; - int apply_cr = - (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 
1 - : 0; - - if (params->chroma_scaling_from_luma) { - cb_mult = 0; // fixed scale - cb_luma_mult = 64; // fixed scale - cb_offset = 0; - - cr_mult = 0; // fixed scale - cr_luma_mult = 64; // fixed scale - cr_offset = 0; - } - - int min_luma, max_luma, min_chroma, max_chroma; - - if (params->clip_to_restricted_range) { - min_luma = min_luma_legal_range << (bit_depth - 8); - max_luma = max_luma_legal_range << (bit_depth - 8); - - if (mc_identity) { - min_chroma = min_luma_legal_range << (bit_depth - 8); - max_chroma = max_luma_legal_range << (bit_depth - 8); - } else { - min_chroma = min_chroma_legal_range << (bit_depth - 8); - max_chroma = max_chroma_legal_range << (bit_depth - 8); - } - } else { - min_luma = min_chroma = 0; - max_luma = max_chroma = (256 << (bit_depth - 8)) - 1; - } - - for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { - for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { - int average_luma = 0; - if (chroma_subsamp_x) { - average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + - (j << chroma_subsamp_x)] + - luma[(i << chroma_subsamp_y) * luma_stride + - (j << chroma_subsamp_x) + 1] + - 1) >> - 1; - } else { - average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; - } - - if (apply_cb) { - cb[i * chroma_stride + j] = clamp( - cb[i * chroma_stride + j] + - ((scale_LUT(scaling_lut_cb, - clamp(((average_luma * cb_luma_mult + - cb_mult * cb[i * chroma_stride + j]) >> - 6) + - cb_offset, - 0, (256 << (bit_depth - 8)) - 1), - bit_depth) * - cb_grain[i * chroma_grain_stride + j] + - rounding_offset) >> - params->scaling_shift), - min_chroma, max_chroma); - } - if (apply_cr) { - cr[i * chroma_stride + j] = clamp( - cr[i * chroma_stride + j] + - ((scale_LUT(scaling_lut_cr, - clamp(((average_luma * cr_luma_mult + - cr_mult * cr[i * chroma_stride + j]) >> - 6) + - cr_offset, - 0, (256 << (bit_depth - 8)) - 1), - bit_depth) * - cr_grain[i * chroma_grain_stride + j] + - rounding_offset) >> - 
params->scaling_shift), - min_chroma, max_chroma); - } - } - } - - if (apply_y) { - for (int i = 0; i < (half_luma_height << 1); i++) { - for (int j = 0; j < (half_luma_width << 1); j++) { - luma[i * luma_stride + j] = - clamp(luma[i * luma_stride + j] + - ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], - bit_depth) * - luma_grain[i * luma_grain_stride + j] + - rounding_offset) >> - params->scaling_shift), - min_luma, max_luma); - } - } - } -} - -static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int width, int height, - int use_high_bit_depth) { - int hbd_coeff = use_high_bit_depth ? 2 : 1; - while (height) { - memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff); - src += src_stride; - dst += dst_stride; - --height; - } - return; -} - -static void copy_area(int *src, int src_stride, int *dst, int dst_stride, - int width, int height) { - while (height) { - memcpy(dst, src, width * sizeof(*src)); - src += src_stride; - dst += dst_stride; - --height; - } - return; -} - -static void extend_even(uint8_t *dst, int dst_stride, int width, int height, - int use_high_bit_depth) { - if ((width & 1) == 0 && (height & 1) == 0) return; - if (use_high_bit_depth) { - uint16_t *dst16 = (uint16_t *)dst; - int dst16_stride = dst_stride / 2; - if (width & 1) { - for (int i = 0; i < height; ++i) - dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1]; - } - width = (width + 1) & (~1); - if (height & 1) { - memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride], - sizeof(*dst16) * width); - } - } else { - if (width & 1) { - for (int i = 0; i < height; ++i) - dst[i * dst_stride + width] = dst[i * dst_stride + width - 1]; - } - width = (width + 1) & (~1); - if (height & 1) { - memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride], - sizeof(*dst) * width); - } - } -} - -static void ver_boundary_overlap(int *left_block, int left_stride, - int *right_block, int right_stride, - int *dst_block, int 
dst_stride, int width, - int height) { - if (width == 1) { - while (height) { - *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5, - grain_min, grain_max); - left_block += left_stride; - right_block += right_stride; - dst_block += dst_stride; - --height; - } - return; - } else if (width == 2) { - while (height) { - dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5, - grain_min, grain_max); - dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5, - grain_min, grain_max); - left_block += left_stride; - right_block += right_stride; - dst_block += dst_stride; - --height; - } - return; - } -} - -static void hor_boundary_overlap(int *top_block, int top_stride, - int *bottom_block, int bottom_stride, - int *dst_block, int dst_stride, int width, - int height) { - if (height == 1) { - while (width) { - *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5, - grain_min, grain_max); - ++top_block; - ++bottom_block; - ++dst_block; - --width; - } - return; - } else if (height == 2) { - while (width) { - dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5, - grain_min, grain_max); - dst_block[dst_stride] = clamp((17 * top_block[top_stride] + - 27 * bottom_block[bottom_stride] + 16) >> - 5, - grain_min, grain_max); - ++top_block; - ++bottom_block; - ++dst_block; - --width; - } - return; - } -} - -int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, - aom_image_t *dst) { - uint8_t *luma, *cb, *cr; - int height, width, luma_stride, chroma_stride; - int use_high_bit_depth = 0; - int chroma_subsamp_x = 0; - int chroma_subsamp_y = 0; - int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 
1 : 0; - - switch (src->fmt) { - case AOM_IMG_FMT_AOMI420: - case AOM_IMG_FMT_I420: - use_high_bit_depth = 0; - chroma_subsamp_x = 1; - chroma_subsamp_y = 1; - break; - case AOM_IMG_FMT_I42016: - use_high_bit_depth = 1; - chroma_subsamp_x = 1; - chroma_subsamp_y = 1; - break; - // case AOM_IMG_FMT_444A: - case AOM_IMG_FMT_I444: - use_high_bit_depth = 0; - chroma_subsamp_x = 0; - chroma_subsamp_y = 0; - break; - case AOM_IMG_FMT_I44416: - use_high_bit_depth = 1; - chroma_subsamp_x = 0; - chroma_subsamp_y = 0; - break; - case AOM_IMG_FMT_I422: - use_high_bit_depth = 0; - chroma_subsamp_x = 1; - chroma_subsamp_y = 0; - break; - case AOM_IMG_FMT_I42216: - use_high_bit_depth = 1; - chroma_subsamp_x = 1; - chroma_subsamp_y = 0; - break; - default: // unknown input format - fprintf(stderr, "Film grain error: input format is not supported!"); - return -1; - } - - assert(params->bit_depth == src->bit_depth); - - dst->fmt = src->fmt; - dst->bit_depth = src->bit_depth; - - dst->r_w = src->r_w; - dst->r_h = src->r_h; - dst->d_w = src->d_w; - dst->d_h = src->d_h; - - dst->cp = src->cp; - dst->tc = src->tc; - dst->mc = src->mc; - - dst->monochrome = src->monochrome; - dst->csp = src->csp; - dst->range = src->range; - - dst->x_chroma_shift = src->x_chroma_shift; - dst->y_chroma_shift = src->y_chroma_shift; - - dst->temporal_id = src->temporal_id; - dst->spatial_id = src->spatial_id; - - width = src->d_w % 2 ? src->d_w + 1 : src->d_w; - height = src->d_h % 2 ? src->d_h + 1 : src->d_h; - - copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], - dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, - src->d_h, use_high_bit_depth); - // Note that dst is already assumed to be aligned to even. 
- extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, - src->d_h, use_high_bit_depth); - - if (!src->monochrome) { - copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], - dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U], - width >> chroma_subsamp_x, height >> chroma_subsamp_y, - use_high_bit_depth); - - copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], - dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], - width >> chroma_subsamp_x, height >> chroma_subsamp_y, - use_high_bit_depth); - } - - luma = dst->planes[AOM_PLANE_Y]; - cb = dst->planes[AOM_PLANE_U]; - cr = dst->planes[AOM_PLANE_V]; - - // luma and chroma strides in samples - luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; - chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; - - return av1_add_film_grain_run( - params, luma, cb, cr, height, width, luma_stride, chroma_stride, - use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); -} - -int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, - uint8_t *cb, uint8_t *cr, int height, int width, - int luma_stride, int chroma_stride, - int use_high_bit_depth, int chroma_subsamp_y, - int chroma_subsamp_x, int mc_identity) { - int **pred_pos_luma; - int **pred_pos_chroma; - int *luma_grain_block; - int *cb_grain_block; - int *cr_grain_block; - - int *y_line_buf; - int *cb_line_buf; - int *cr_line_buf; - - int *y_col_buf; - int *cb_col_buf; - int *cr_col_buf; - - random_register = params->random_seed; - - int left_pad = 3; - int right_pad = 3; // padding to offset for AR coefficients - int top_pad = 3; - int bottom_pad = 0; - - int ar_padding = 3; // maximum lag used for stabilization of AR coefficients - - luma_subblock_size_y = 32; - luma_subblock_size_x = 32; - - chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y; - chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x; - - // Initial padding is only needed for generation of - // 
film grain templates (to stabilize the AR process) - // Only a 64x64 luma and 32x32 chroma part of a template - // is used later for adding grain, padding can be discarded - - int luma_block_size_y = - top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad; - int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 + - 2 * ar_padding + right_pad; - - int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + - chroma_subblock_size_y * 2 + bottom_pad; - int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + - chroma_subblock_size_x * 2 + - (2 >> chroma_subsamp_x) * ar_padding + right_pad; - - int luma_grain_stride = luma_block_size_x; - int chroma_grain_stride = chroma_block_size_x; - - int overlap = params->overlap_flag; - int bit_depth = params->bit_depth; - - grain_center = 128 << (bit_depth - 8); - grain_min = 0 - grain_center; - grain_max = (256 << (bit_depth - 8)) - 1 - grain_center; - - init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma, - &pred_pos_chroma, &luma_grain_block, &cb_grain_block, - &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, - &y_col_buf, &cb_col_buf, &cr_col_buf, - luma_block_size_y * luma_block_size_x, - chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, - chroma_subsamp_x); - - if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, - luma_block_size_y, luma_block_size_x, - luma_grain_stride, left_pad, top_pad, right_pad, - bottom_pad)) - return -1; - - if (generate_chroma_grain_blocks( - params, - // pred_pos_luma, - pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, - luma_grain_stride, chroma_block_size_y, chroma_block_size_x, - chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, - chroma_subsamp_y, chroma_subsamp_x)) - return -1; - - init_scaling_function(params->scaling_points_y, params->num_y_points, - scaling_lut_y); - - if (params->chroma_scaling_from_luma) { - memcpy(scaling_lut_cb, 
scaling_lut_y, sizeof(*scaling_lut_y) * 256); - memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256); - } else { - init_scaling_function(params->scaling_points_cb, params->num_cb_points, - scaling_lut_cb); - init_scaling_function(params->scaling_points_cr, params->num_cr_points, - scaling_lut_cr); - } - for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) { - init_random_generator(y * 2, params->random_seed); - - for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) { - int offset_y = get_random_number(8); - int offset_x = (offset_y >> 4) & 15; - offset_y &= 15; - - int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1); - int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1); - - int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + - offset_y * (2 >> chroma_subsamp_y); - int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + - offset_x * (2 >> chroma_subsamp_x); - - if (overlap && x) { - ver_boundary_overlap( - y_col_buf, 2, - luma_grain_block + luma_offset_y * luma_grain_stride + - luma_offset_x, - luma_grain_stride, y_col_buf, 2, 2, - AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); - - ver_boundary_overlap( - cb_col_buf, 2 >> chroma_subsamp_x, - cb_grain_block + chroma_offset_y * chroma_grain_stride + - chroma_offset_x, - chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, - 2 >> chroma_subsamp_x, - AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), - (height - (y << 1)) >> chroma_subsamp_y)); - - ver_boundary_overlap( - cr_col_buf, 2 >> chroma_subsamp_x, - cr_grain_block + chroma_offset_y * chroma_grain_stride + - chroma_offset_x, - chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, - 2 >> chroma_subsamp_x, - AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), - (height - (y << 1)) >> chroma_subsamp_y)); - - int i = y ? 
1 : 0; - - if (use_high_bit_depth) { - add_noise_to_block_hbd( - params, - (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1), - (uint16_t *)cb + - ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - (x << (1 - chroma_subsamp_x)), - (uint16_t *)cr + - ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - (x << (1 - chroma_subsamp_x)), - luma_stride, chroma_stride, y_col_buf + i * 4, - cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), - cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), - 2, (2 - chroma_subsamp_x), - AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, - bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); - } else { - add_noise_to_block( - params, luma + ((y + i) << 1) * luma_stride + (x << 1), - cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - (x << (1 - chroma_subsamp_x)), - cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - (x << (1 - chroma_subsamp_x)), - luma_stride, chroma_stride, y_col_buf + i * 4, - cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), - cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), - 2, (2 - chroma_subsamp_x), - AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, - bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); - } - } - - if (overlap && y) { - if (x) { - hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2, - y_line_buf + (x << 1), luma_stride, 2, 2); - - hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x), - chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x, - cb_line_buf + x * (2 >> chroma_subsamp_x), - chroma_stride, 2 >> chroma_subsamp_x, - 2 >> chroma_subsamp_y); - - hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x), - chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x, - cr_line_buf + x * (2 >> chroma_subsamp_x), - chroma_stride, 2 >> chroma_subsamp_x, - 2 >> chroma_subsamp_y); - } - - hor_boundary_overlap( - y_line_buf + ((x ? 
x + 1 : 0) << 1), luma_stride, - luma_grain_block + luma_offset_y * luma_grain_stride + - luma_offset_x + (x ? 2 : 0), - luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, - AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1), - width - ((x ? x + 1 : 0) << 1)), - 2); - - hor_boundary_overlap( - cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), - chroma_stride, - cb_grain_block + chroma_offset_y * chroma_grain_stride + - chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), - chroma_grain_stride, - cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), - chroma_stride, - AOMMIN(chroma_subblock_size_x - - ((x ? 1 : 0) << (1 - chroma_subsamp_x)), - (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), - 2 >> chroma_subsamp_y); - - hor_boundary_overlap( - cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), - chroma_stride, - cr_grain_block + chroma_offset_y * chroma_grain_stride + - chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), - chroma_grain_stride, - cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), - chroma_stride, - AOMMIN(chroma_subblock_size_x - - ((x ? 1 : 0) << (1 - chroma_subsamp_x)), - (width - ((x ? 
x + 1 : 0) << 1)) >> chroma_subsamp_x), - 2 >> chroma_subsamp_y); - - if (use_high_bit_depth) { - add_noise_to_block_hbd( - params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1), - (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + - (x << ((1 - chroma_subsamp_x))), - (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + - (x << ((1 - chroma_subsamp_x))), - luma_stride, chroma_stride, y_line_buf + (x << 1), - cb_line_buf + (x << (1 - chroma_subsamp_x)), - cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, - chroma_stride, 1, - AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, - chroma_subsamp_y, chroma_subsamp_x, mc_identity); - } else { - add_noise_to_block( - params, luma + (y << 1) * luma_stride + (x << 1), - cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + - (x << ((1 - chroma_subsamp_x))), - cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + - (x << ((1 - chroma_subsamp_x))), - luma_stride, chroma_stride, y_line_buf + (x << 1), - cb_line_buf + (x << (1 - chroma_subsamp_x)), - cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, - chroma_stride, 1, - AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, - chroma_subsamp_y, chroma_subsamp_x, mc_identity); - } - } - - int i = overlap && y ? 1 : 0; - int j = overlap && x ? 
1 : 0; - - if (use_high_bit_depth) { - add_noise_to_block_hbd( - params, - (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), - (uint16_t *)cb + - ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - ((x + j) << (1 - chroma_subsamp_x)), - (uint16_t *)cr + - ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - ((x + j) << (1 - chroma_subsamp_x)), - luma_stride, chroma_stride, - luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + - luma_offset_x + (j << 1), - cb_grain_block + - (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * - chroma_grain_stride + - chroma_offset_x + (j << (1 - chroma_subsamp_x)), - cr_grain_block + - (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * - chroma_grain_stride + - chroma_offset_x + (j << (1 - chroma_subsamp_x)), - luma_grain_stride, chroma_grain_stride, - AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, - AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, - chroma_subsamp_y, chroma_subsamp_x, mc_identity); - } else { - add_noise_to_block( - params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), - cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - ((x + j) << (1 - chroma_subsamp_x)), - cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + - ((x + j) << (1 - chroma_subsamp_x)), - luma_stride, chroma_stride, - luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + - luma_offset_x + (j << 1), - cb_grain_block + - (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * - chroma_grain_stride + - chroma_offset_x + (j << (1 - chroma_subsamp_x)), - cr_grain_block + - (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * - chroma_grain_stride + - chroma_offset_x + (j << (1 - chroma_subsamp_x)), - luma_grain_stride, chroma_grain_stride, - AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, - AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, - chroma_subsamp_y, chroma_subsamp_x, mc_identity); - } - - if (overlap) { - if (x) { - // Copy 
overlapped column bufer to line buffer - copy_area(y_col_buf + (luma_subblock_size_y << 1), 2, - y_line_buf + (x << 1), luma_stride, 2, 2); - - copy_area( - cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), - 2 >> chroma_subsamp_x, - cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, - 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); - - copy_area( - cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), - 2 >> chroma_subsamp_x, - cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, - 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); - } - - // Copy grain to the line buffer for overlap with a bottom block - copy_area( - luma_grain_block + - (luma_offset_y + luma_subblock_size_y) * luma_grain_stride + - luma_offset_x + ((x ? 2 : 0)), - luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, - AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2); - - copy_area(cb_grain_block + - (chroma_offset_y + chroma_subblock_size_y) * - chroma_grain_stride + - chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), - chroma_grain_stride, - cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), - chroma_stride, - AOMMIN(chroma_subblock_size_x, - ((width - (x << 1)) >> chroma_subsamp_x)) - - (x ? 2 >> chroma_subsamp_x : 0), - 2 >> chroma_subsamp_y); - - copy_area(cr_grain_block + - (chroma_offset_y + chroma_subblock_size_y) * - chroma_grain_stride + - chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), - chroma_grain_stride, - cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), - chroma_stride, - AOMMIN(chroma_subblock_size_x, - ((width - (x << 1)) >> chroma_subsamp_x)) - - (x ? 
2 >> chroma_subsamp_x : 0), - 2 >> chroma_subsamp_y); - - // Copy grain to the column buffer for overlap with the next block to - // the right - - copy_area(luma_grain_block + luma_offset_y * luma_grain_stride + - luma_offset_x + luma_subblock_size_x, - luma_grain_stride, y_col_buf, 2, 2, - AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); - - copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride + - chroma_offset_x + chroma_subblock_size_x, - chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, - 2 >> chroma_subsamp_x, - AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), - (height - (y << 1)) >> chroma_subsamp_y)); - - copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride + - chroma_offset_x + chroma_subblock_size_x, - chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, - 2 >> chroma_subsamp_x, - AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), - (height - (y << 1)) >> chroma_subsamp_y)); - } - } - } - - dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, - &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, - &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); - return 0; -} diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h deleted file mode 100644 index 7aee6f6f4..000000000 --- a/third_party/aom/aom_dsp/grain_synthesis.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -/*!\file - * \brief Describes film grain parameters and film grain synthesis - * - */ -#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ -#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "aom_dsp/aom_dsp_common.h" -#include "aom/aom_image.h" - -/*!\brief Structure containing film grain synthesis parameters for a frame - * - * This structure contains input parameters for film grain synthesis - */ -typedef struct { - int apply_grain; - - int update_parameters; - - // 8 bit values - int scaling_points_y[14][2]; - int num_y_points; // value: 0..14 - - // 8 bit values - int scaling_points_cb[10][2]; - int num_cb_points; // value: 0..10 - - // 8 bit values - int scaling_points_cr[10][2]; - int num_cr_points; // value: 0..10 - - int scaling_shift; // values : 8..11 - - int ar_coeff_lag; // values: 0..3 - - // 8 bit values - int ar_coeffs_y[24]; - int ar_coeffs_cb[25]; - int ar_coeffs_cr[25]; - - // Shift value: AR coeffs range - // 6: [-2, 2) - // 7: [-1, 1) - // 8: [-0.5, 0.5) - // 9: [-0.25, 0.25) - int ar_coeff_shift; // values : 6..9 - - int cb_mult; // 8 bits - int cb_luma_mult; // 8 bits - int cb_offset; // 9 bits - - int cr_mult; // 8 bits - int cr_luma_mult; // 8 bits - int cr_offset; // 9 bits - - int overlap_flag; - - int clip_to_restricted_range; - - unsigned int bit_depth; // video bit depth - - int chroma_scaling_from_luma; - - int grain_scale_shift; - - uint16_t random_seed; -} aom_film_grain_t; - -/*!\brief Add film grain - * - * Add film grain to an image - * - * Returns 0 for success, -1 for failure - * - * \param[in] grain_params Grain parameters - * \param[in] luma luma plane - * \param[in] cb cb plane - * \param[in] cr cr plane - * \param[in] height luma plane height - * \param[in] width luma plane width - * \param[in] luma_stride luma plane stride - * \param[in] chroma_stride chroma plane stride - */ -int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, - uint8_t *cb, uint8_t *cr, 
int height, int width, - int luma_stride, int chroma_stride, - int use_high_bit_depth, int chroma_subsamp_y, - int chroma_subsamp_x, int mc_identity); - -/*!\brief Add film grain - * - * Add film grain to an image - * - * Returns 0 for success, -1 for failure - * - * \param[in] grain_params Grain parameters - * \param[in] src Source image - * \param[out] dst Resulting image with grain - */ -int av1_add_film_grain(const aom_film_grain_t *grain_params, - const aom_image_t *src, aom_image_t *dst); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c deleted file mode 100644 index 0d6a73f55..000000000 --- a/third_party/aom/aom_dsp/grain_table.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/*!\file - * \brief This file has the implementation details of the grain table. - * - * The file format is an ascii representation for readability and - * editability. Array parameters are separated from the non-array - * parameters and prefixed with a few characters to make for easy - * localization with a parameter set. Each entry is prefixed with "E" - * and the other parameters are only specified if "update-parms" is - * non-zero. - * - * filmgrn1 - * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms> - * p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ... - * sY <num_y_points> <point_0_x> <point_0_y> ... 
- * sCb <num_cb_points> <point_0_x> <point_0_y> ... - * sCr <num_cr_points> <point_0_x> <point_0_y> ... - * cY <ar_coeff_y_0> .... - * cCb <ar_coeff_cb_0> .... - * cCr <ar_coeff_cr_0> .... - * E <start-time> ... - */ -#include <string.h> -#include <stdio.h> -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/grain_table.h" -#include "aom_mem/aom_mem.h" - -static const char kFileMagic[8] = "filmgrn1"; - -static void grain_table_entry_read(FILE *file, - struct aom_internal_error_info *error_info, - aom_film_grain_table_entry_t *entry) { - aom_film_grain_t *pars = &entry->params; - int num_read = - fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time, - &entry->end_time, &pars->apply_grain, &pars->random_seed, - &pars->update_parameters); - if (num_read == 0 && feof(file)) return; - if (num_read != 5) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read entry header. Read %d != 5", num_read); - return; - } - if (pars->update_parameters) { - num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n", - &pars->ar_coeff_lag, &pars->ar_coeff_shift, - &pars->grain_scale_shift, &pars->scaling_shift, - &pars->chroma_scaling_from_luma, &pars->overlap_flag, - &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset, - &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset); - if (num_read != 12) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read entry params. 
Read %d != 12", - num_read); - return; - } - if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read num y points"); - return; - } - for (int i = 0; i < pars->num_y_points; ++i) { - if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0], - &pars->scaling_points_y[i][1])) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read y scaling points"); - return; - } - } - if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read num cb points"); - return; - } - for (int i = 0; i < pars->num_cb_points; ++i) { - if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0], - &pars->scaling_points_cb[i][1])) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read cb scaling points"); - return; - } - } - if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read num cr points"); - return; - } - for (int i = 0; i < pars->num_cr_points; ++i) { - if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0], - &pars->scaling_points_cr[i][1])) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read cr scaling points"); - return; - } - } - - fscanf(file, "\n\tcY"); - const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); - for (int i = 0; i < n; ++i) { - if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read Y coeffs"); - return; - } - } - fscanf(file, "\n\tcCb"); - for (int i = 0; i <= n; ++i) { - if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read Cb coeffs"); - return; - } - } - fscanf(file, "\n\tcCr"); - for (int i = 0; i <= n; ++i) { - if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read Cr coeffs"); - return; - } - } 
- fscanf(file, "\n"); - } -} - -void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) { - const aom_film_grain_t *pars = &entry->params; - fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time, - entry->end_time, pars->apply_grain, pars->random_seed, - pars->update_parameters); - if (pars->update_parameters) { - fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n", - pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift, - pars->scaling_shift, pars->chroma_scaling_from_luma, - pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult, - pars->cb_offset, pars->cr_mult, pars->cr_luma_mult, - pars->cr_offset); - fprintf(file, "\tsY %d ", pars->num_y_points); - for (int i = 0; i < pars->num_y_points; ++i) { - fprintf(file, " %d %d", pars->scaling_points_y[i][0], - pars->scaling_points_y[i][1]); - } - fprintf(file, "\n\tsCb %d", pars->num_cb_points); - for (int i = 0; i < pars->num_cb_points; ++i) { - fprintf(file, " %d %d", pars->scaling_points_cb[i][0], - pars->scaling_points_cb[i][1]); - } - fprintf(file, "\n\tsCr %d", pars->num_cr_points); - for (int i = 0; i < pars->num_cr_points; ++i) { - fprintf(file, " %d %d", pars->scaling_points_cr[i][0], - pars->scaling_points_cr[i][1]); - } - fprintf(file, "\n\tcY"); - const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); - for (int i = 0; i < n; ++i) { - fprintf(file, " %d", pars->ar_coeffs_y[i]); - } - fprintf(file, "\n\tcCb"); - for (int i = 0; i <= n; ++i) { - fprintf(file, " %d", pars->ar_coeffs_cb[i]); - } - fprintf(file, "\n\tcCr"); - for (int i = 0; i <= n; ++i) { - fprintf(file, " %d", pars->ar_coeffs_cr[i]); - } - fprintf(file, "\n"); - } -} - -void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp, - int64_t end_time, - const aom_film_grain_t *grain) { - if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) { - aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail)); - memset(new_tail, 0, 
sizeof(*new_tail)); - if (t->tail) t->tail->next = new_tail; - if (!t->head) t->head = new_tail; - t->tail = new_tail; - - new_tail->start_time = time_stamp; - new_tail->end_time = end_time; - new_tail->params = *grain; - } else { - t->tail->end_time = AOMMAX(t->tail->end_time, end_time); - t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp); - } -} - -int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, - int64_t end_time, int erase, - aom_film_grain_t *grain) { - aom_film_grain_table_entry_t *entry = t->head; - aom_film_grain_table_entry_t *prev_entry = 0; - int16_t random_seed = grain ? grain->random_seed : 0; - if (grain) memset(grain, 0, sizeof(*grain)); - - while (entry) { - aom_film_grain_table_entry_t *next = entry->next; - if (time_stamp >= entry->start_time && time_stamp < entry->end_time) { - if (grain) { - *grain = entry->params; - if (time_stamp != 0) grain->random_seed = random_seed; - } - if (!erase) return 1; - - const int64_t entry_end_time = entry->end_time; - if (time_stamp <= entry->start_time && end_time >= entry->end_time) { - if (t->tail == entry) t->tail = prev_entry; - if (prev_entry) { - prev_entry->next = entry->next; - } else { - t->head = entry->next; - } - aom_free(entry); - } else if (time_stamp <= entry->start_time && - end_time < entry->end_time) { - entry->start_time = end_time; - } else if (time_stamp > entry->start_time && - end_time >= entry->end_time) { - entry->end_time = time_stamp; - } else { - aom_film_grain_table_entry_t *new_entry = - aom_malloc(sizeof(*new_entry)); - new_entry->next = entry->next; - new_entry->start_time = end_time; - new_entry->end_time = entry->end_time; - new_entry->params = entry->params; - entry->next = new_entry; - entry->end_time = time_stamp; - if (t->tail == entry) t->tail = new_entry; - } - // If segments aren't aligned, delete from the beggining of subsequent - // segments - if (end_time > entry_end_time) { - aom_film_grain_table_lookup(t, entry->end_time, 
end_time, 1, 0); - } - return 1; - } - prev_entry = entry; - entry = next; - } - return 0; -} - -aom_codec_err_t aom_film_grain_table_read( - aom_film_grain_table_t *t, const char *filename, - struct aom_internal_error_info *error_info) { - FILE *file = fopen(filename, "rb"); - if (!file) { - aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s", - filename); - return error_info->error_code; - } - error_info->error_code = AOM_CODEC_OK; - - // Read in one extra character as there should be white space after - // the header. - char magic[9]; - if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to read (or invalid) file magic"); - fclose(file); - return error_info->error_code; - } - - aom_film_grain_table_entry_t *prev_entry = 0; - while (!feof(file)) { - aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); - memset(entry, 0, sizeof(*entry)); - grain_table_entry_read(file, error_info, entry); - entry->next = 0; - - if (prev_entry) prev_entry->next = entry; - if (!t->head) t->head = entry; - t->tail = entry; - prev_entry = entry; - - if (error_info->error_code != AOM_CODEC_OK) break; - } - - fclose(file); - return error_info->error_code; -} - -aom_codec_err_t aom_film_grain_table_write( - const aom_film_grain_table_t *t, const char *filename, - struct aom_internal_error_info *error_info) { - error_info->error_code = AOM_CODEC_OK; - - FILE *file = fopen(filename, "wb"); - if (!file) { - aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s", - filename); - return error_info->error_code; - } - - if (!fwrite(kFileMagic, 8, 1, file)) { - aom_internal_error(error_info, AOM_CODEC_ERROR, - "Unable to write file magic"); - fclose(file); - return error_info->error_code; - } - - fprintf(file, "\n"); - aom_film_grain_table_entry_t *entry = t->head; - while (entry) { - grain_table_entry_write(file, entry); - entry = entry->next; - } - fclose(file); - return 
error_info->error_code; -} - -void aom_film_grain_table_free(aom_film_grain_table_t *t) { - aom_film_grain_table_entry_t *entry = t->head; - while (entry) { - aom_film_grain_table_entry_t *next = entry->next; - aom_free(entry); - entry = next; - } - memset(t, 0, sizeof(*t)); -} diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h deleted file mode 100644 index a8ac50730..000000000 --- a/third_party/aom/aom_dsp/grain_table.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/*!\file - * \brief A table mapping from time to corresponding film grain parameters. - * - * In order to apply grain synthesis in the decoder, the film grain parameters - * need to be signalled in the encoder. The film grain parameters are time - * varying, and for two-pass encoding (and denoiser implementation flexibility) - * it is common to denoise the video and do parameter estimation before encoding - * the denoised video. - * - * The film grain table is used to provide this flexibility and is used as a - * parameter that is passed to the encoder. - * - * Further, if regraining is to be done in say a single pass mode, or in two - * pass within the encoder (before frames are added to the lookahead buffer), - * this data structure can be used to keep track of on-the-fly estimated grain - * parameters, that are then extracted from the table before the encoded frame - * is written. 
- */ -#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_ -#define AOM_AOM_DSP_GRAIN_TABLE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "aom_dsp/grain_synthesis.h" -#include "aom/internal/aom_codec_internal.h" - -typedef struct aom_film_grain_table_entry_t { - aom_film_grain_t params; - int64_t start_time; - int64_t end_time; - struct aom_film_grain_table_entry_t *next; -} aom_film_grain_table_entry_t; - -typedef struct { - aom_film_grain_table_entry_t *head; - aom_film_grain_table_entry_t *tail; -} aom_film_grain_table_t; - -/*!\brief Add a mapping from [time_stamp, end_time) to the given grain - * parameters - * - * \param[in/out] table The grain table - * \param[in] time_stamp The start time stamp - * \param[in] end_stamp The end time_stamp - * \param[in] grain The grain parameters - */ -void aom_film_grain_table_append(aom_film_grain_table_t *table, - int64_t time_stamp, int64_t end_time, - const aom_film_grain_t *grain); - -/*!\brief Look-up (and optionally erase) the grain parameters for the given time - * - * \param[in] table The grain table - * \param[in] time_stamp The start time stamp - * \param[in] end_stamp The end time_stamp - * \param[in] erase Whether the time segment can be deleted - * \param[out] grain The output grain parameters - */ -int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, - int64_t end_time, int erase, - aom_film_grain_t *grain); - -/*!\brief Reads the grain table from a file. - * - * \param[out] table The grain table - * \param[in] filename The file to read from - * \param[in] error_info Error info for tracking errors - */ -aom_codec_err_t aom_film_grain_table_read( - aom_film_grain_table_t *table, const char *filename, - struct aom_internal_error_info *error_info); - -/*!\brief Writes the grain table from a file. 
- * - * \param[out] table The grain table - * \param[in] filename The file to read from - * \param[in] error_info Error info for tracking errors - */ -aom_codec_err_t aom_film_grain_table_write( - const aom_film_grain_table_t *t, const char *filename, - struct aom_internal_error_info *error_info); - -void aom_film_grain_table_free(aom_film_grain_table_t *t); - -#ifdef __cplusplus -} -#endif - -#endif // AOM_AOM_DSP_GRAIN_TABLE_H_ diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c deleted file mode 100644 index c6aa6b207..000000000 --- a/third_party/aom/aom_dsp/intrapred.c +++ /dev/null @@ -1,792 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <math.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/intrapred_common.h" -#include "aom_mem/aom_mem.h" -#include "aom_ports/bitops.h" - -static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left) { - int r; - (void)left; - - for (r = 0; r < bh; r++) { - memcpy(dst, above, bw); - dst += stride; - } -} - -static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left) { - int r; - (void)above; - - for (r = 0; r < bh; r++) { - memset(dst, left[r], bw); - dst += stride; - } -} - -static INLINE int abs_diff(int a, int b) { return (a > b) ? 
a - b : b - a; } - -static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, - uint16_t top_left) { - const int base = top + left - top_left; - const int p_left = abs_diff(base, left); - const int p_top = abs_diff(base, top); - const int p_top_left = abs_diff(base, top_left); - - // Return nearest to base of left, top and top_left. - return (p_left <= p_top && p_left <= p_top_left) - ? left - : (p_top <= p_top_left) ? top : top_left; -} - -static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int r, c; - const uint8_t ytop_left = above[-1]; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left); - dst += stride; - } -} - -// Some basic checks on weights for smooth predictor. -#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \ - pred_scale) \ - assert(weights_w[0] < weights_scale); \ - assert(weights_h[0] < weights_scale); \ - assert(weights_scale - weights_w[bw - 1] < weights_scale); \ - assert(weights_scale - weights_h[bh - 1] < weights_scale); \ - assert(pred_scale < 31) // ensures no overflow when calculating predictor. 
- -#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) - -static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel - const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights_w = sm_weight_arrays + bw; - const uint8_t *const sm_weights_h = sm_weight_arrays + bh; - // scale = 2 * 2^sm_weight_log2_scale - const int log2_scale = 1 + sm_weight_log2_scale; - const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, - log2_scale + sizeof(*dst)); - int r; - for (r = 0; r < bh; ++r) { - int c; - for (c = 0; c < bw; ++c) { - const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; - const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], - sm_weights_w[c], scale - sm_weights_w[c] }; - uint32_t this_pred = 0; - int i; - assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); - for (i = 0; i < 4; ++i) { - this_pred += weights[i] * pixels[i]; - } - dst[c] = divide_round(this_pred, log2_scale); - } - dst += stride; - } -} - -static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel - const uint8_t *const sm_weights = sm_weight_arrays + bh; - // scale = 2^sm_weight_log2_scale - const int log2_scale = sm_weight_log2_scale; - const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, sm_weights, scale, - log2_scale + sizeof(*dst)); - - int r; - for (r = 0; r < bh; r++) { - int c; - for (c = 0; c < bw; ++c) { - const uint8_t pixels[] = { above[c], below_pred }; - const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; - uint32_t this_pred = 0; - assert(scale >= 
sm_weights[r]); - int i; - for (i = 0; i < 2; ++i) { - this_pred += weights[i] * pixels[i]; - } - dst[c] = divide_round(this_pred, log2_scale); - } - dst += stride; - } -} - -static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights = sm_weight_arrays + bw; - // scale = 2^sm_weight_log2_scale - const int log2_scale = sm_weight_log2_scale; - const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, sm_weights, scale, - log2_scale + sizeof(*dst)); - - int r; - for (r = 0; r < bh; r++) { - int c; - for (c = 0; c < bw; ++c) { - const uint8_t pixels[] = { left[r], right_pred }; - const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; - uint32_t this_pred = 0; - assert(scale >= sm_weights[c]); - int i; - for (i = 0; i < 2; ++i) { - this_pred += weights[i] * pixels[i]; - } - dst[c] = divide_round(this_pred, log2_scale); - } - dst += stride; - } -} - -static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int r; - (void)above; - (void)left; - - for (r = 0; r < bh; r++) { - memset(dst, 128, bw); - dst += stride; - } -} - -static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int i, r, expected_dc, sum = 0; - (void)above; - - for (i = 0; i < bh; i++) sum += left[i]; - expected_dc = (sum + (bh >> 1)) / bh; - - for (r = 0; r < bh; r++) { - memset(dst, expected_dc, bw); - dst += stride; - } -} - -static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int i, r, expected_dc, sum = 0; - (void)left; - - for (i = 0; i < bw; i++) sum += above[i]; - expected_dc = (sum + (bw >> 1)) / bw; - - for (r = 0; r < bh; r++) { - 
memset(dst, expected_dc, bw); - dst += stride; - } -} - -static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left) { - int i, r, expected_dc, sum = 0; - const int count = bw + bh; - - for (i = 0; i < bw; i++) { - sum += above[i]; - } - for (i = 0; i < bh; i++) { - sum += left[i]; - } - - expected_dc = (sum + (count >> 1)) / count; - - for (r = 0; r < bh; r++) { - memset(dst, expected_dc, bw); - dst += stride; - } -} - -static INLINE int divide_using_multiply_shift(int num, int shift1, - int multiplier, int shift2) { - const int interm = num >> shift1; - return interm * multiplier >> shift2; -} - - // The constants (multiplier and shifts) for a given block size are obtained - // as follows: - // - Let sum_w_h = block width + block height. - // - Shift 'sum_w_h' right until we reach an odd number. Let the number of - // shifts for that block size be called 'shift1' (see the parameter in - // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2 - // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect - // block]. - // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5, - // using the "Algorithm 1" in: - // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632 - // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd - // shift will be 16, regardless of the block size. - - // Note: For low bitdepth, assembly code may be optimized by using smaller - // constants for smaller block sizes, where the range of the 'sum' is - // restricted to fewer bits. 
- -#define DC_MULTIPLIER_1X2 0x5556 -#define DC_MULTIPLIER_1X4 0x3334 - -#define DC_SHIFT2 16 - -static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left, int shift1, - int multiplier) { - int sum = 0; - - for (int i = 0; i < bw; i++) { - sum += above[i]; - } - for (int i = 0; i < bh; i++) { - sum += left[i]; - } - - const int expected_dc = divide_using_multiply_shift( - sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); - assert(expected_dc < (1 << 8)); - - for (int r = 0; r < bh; r++) { - memset(dst, expected_dc, bw); - dst += stride; - } -} - -#undef DC_SHIFT2 - -void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2); -} - -void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2); -} - -void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4); -} - -void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4); -} - -void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2); -} - -void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2); -} - -void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4); -} - -void 
aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4); -} - -void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2); -} - -void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2); -} - -void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4); -} - -void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4); -} - -void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2); -} - -void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2); -} - -#undef DC_MULTIPLIER_1X2 -#undef DC_MULTIPLIER_1X4 - -static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void)left; - (void)bd; - for (r = 0; r < bh; r++) { - memcpy(dst, above, bw * sizeof(uint16_t)); - dst += stride; - } -} - -static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void)above; - (void)bd; - for (r = 0; r < bh; r++) { - aom_memset16(dst, left[r], bw); - dst += stride; - } -} - -static INLINE void highbd_paeth_predictor(uint16_t 
*dst, ptrdiff_t stride, - int bw, int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - const uint16_t ytop_left = above[-1]; - (void)bd; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); - dst += stride; - } -} - -static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel - const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights_w = sm_weight_arrays + bw; - const uint8_t *const sm_weights_h = sm_weight_arrays + bh; - // scale = 2 * 2^sm_weight_log2_scale - const int log2_scale = 1 + sm_weight_log2_scale; - const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, - log2_scale + sizeof(*dst)); - int r; - for (r = 0; r < bh; ++r) { - int c; - for (c = 0; c < bw; ++c) { - const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; - const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], - sm_weights_w[c], scale - sm_weights_w[c] }; - uint32_t this_pred = 0; - int i; - assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); - for (i = 0; i < 4; ++i) { - this_pred += weights[i] * pixels[i]; - } - dst[c] = divide_round(this_pred, log2_scale); - } - dst += stride; - } -} - -static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel - const uint8_t *const sm_weights = sm_weight_arrays + bh; - // scale = 2^sm_weight_log2_scale - const int log2_scale = sm_weight_log2_scale; - const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, 
sm_weights, scale, - log2_scale + sizeof(*dst)); - - int r; - for (r = 0; r < bh; r++) { - int c; - for (c = 0; c < bw; ++c) { - const uint16_t pixels[] = { above[c], below_pred }; - const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; - uint32_t this_pred = 0; - assert(scale >= sm_weights[r]); - int i; - for (i = 0; i < 2; ++i) { - this_pred += weights[i] * pixels[i]; - } - dst[c] = divide_round(this_pred, log2_scale); - } - dst += stride; - } -} - -static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights = sm_weight_arrays + bw; - // scale = 2^sm_weight_log2_scale - const int log2_scale = sm_weight_log2_scale; - const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, sm_weights, scale, - log2_scale + sizeof(*dst)); - - int r; - for (r = 0; r < bh; r++) { - int c; - for (c = 0; c < bw; ++c) { - const uint16_t pixels[] = { left[r], right_pred }; - const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; - uint32_t this_pred = 0; - assert(scale >= sm_weights[c]); - int i; - for (i = 0; i < 2; ++i) { - this_pred += weights[i] * pixels[i]; - } - dst[c] = divide_round(this_pred, log2_scale); - } - dst += stride; - } -} - -static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, - const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void)above; - (void)left; - - for (r = 0; r < bh; r++) { - aom_memset16(dst, 128 << (bd - 8), bw); - dst += stride; - } -} - -static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, - const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - (void)above; - (void)bd; - - for (i = 0; i < bh; i++) sum += left[i]; - expected_dc = (sum + (bh >> 
1)) / bh; - - for (r = 0; r < bh; r++) { - aom_memset16(dst, expected_dc, bw); - dst += stride; - } -} - -static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, - const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - (void)left; - (void)bd; - - for (i = 0; i < bw; i++) sum += above[i]; - expected_dc = (sum + (bw >> 1)) / bw; - - for (r = 0; r < bh; r++) { - aom_memset16(dst, expected_dc, bw); - dst += stride; - } -} - -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - const int count = bw + bh; - (void)bd; - - for (i = 0; i < bw; i++) { - sum += above[i]; - } - for (i = 0; i < bh; i++) { - sum += left[i]; - } - - expected_dc = (sum + (count >> 1)) / count; - - for (r = 0; r < bh; r++) { - aom_memset16(dst, expected_dc, bw); - dst += stride; - } -} - -// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but -// assume 2nd shift of 17 bits instead of 16. -// Note: Strictly speaking, 2nd shift needs to be 17 only when: -// - bit depth == 12, and -// - bw + bh is divisible by 5 (as opposed to divisible by 3). -// All other cases can use half the multipliers with a shift of 16 instead. -// This special optimization can be used when writing assembly code. -#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB -// Note: This constant is odd, but a smaller even constant (0x199a) with the -// appropriate shift should work for neon in 8/10-bit. 
-#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 - -#define HIGHBD_DC_SHIFT2 17 - -static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, - const uint16_t *above, - const uint16_t *left, int bd, - int shift1, uint32_t multiplier) { - int sum = 0; - (void)bd; - - for (int i = 0; i < bw; i++) { - sum += above[i]; - } - for (int i = 0; i < bh; i++) { - sum += left[i]; - } - - const int expected_dc = divide_using_multiply_shift( - sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); - assert(expected_dc < (1 << bd)); - - for (int r = 0; r < bh; r++) { - aom_memset16(dst, expected_dc, bw); - dst += stride; - } -} - -#undef HIGHBD_DC_SHIFT2 - -void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2, - HIGHBD_DC_MULTIPLIER_1X2); -} - -void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2, - HIGHBD_DC_MULTIPLIER_1X2); -} - -void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2, - HIGHBD_DC_MULTIPLIER_1X4); -} - -void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2, - HIGHBD_DC_MULTIPLIER_1X4); -} - -void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3, - HIGHBD_DC_MULTIPLIER_1X2); -} - -void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, 
stride, 16, 8, above, left, bd, 3, - HIGHBD_DC_MULTIPLIER_1X2); -} - -void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3, - HIGHBD_DC_MULTIPLIER_1X4); -} - -void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3, - HIGHBD_DC_MULTIPLIER_1X4); -} - -void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4, - HIGHBD_DC_MULTIPLIER_1X2); -} - -void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4, - HIGHBD_DC_MULTIPLIER_1X2); -} - -void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4, - HIGHBD_DC_MULTIPLIER_1X4); -} - -void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4, - HIGHBD_DC_MULTIPLIER_1X4); -} - -void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5, - HIGHBD_DC_MULTIPLIER_1X2); -} - -void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5, - HIGHBD_DC_MULTIPLIER_1X2); -} - -#undef HIGHBD_DC_MULTIPLIER_1X2 -#undef HIGHBD_DC_MULTIPLIER_1X4 - -// This 
serves as a wrapper function, so that all the prediction functions -// can be unified and accessed as a pointer array. Note that the boundary -// above and left are not necessarily used all the time. -#define intra_pred_sized(type, width, height) \ - void aom_##type##_predictor_##width##x##height##_c( \ - uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ - const uint8_t *left) { \ - type##_predictor(dst, stride, width, height, above, left); \ - } - -#define intra_pred_highbd_sized(type, width, height) \ - void aom_highbd_##type##_predictor_##width##x##height##_c( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ - highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \ - } - -/* clang-format off */ -#define intra_pred_rectangular(type) \ - intra_pred_sized(type, 4, 8) \ - intra_pred_sized(type, 8, 4) \ - intra_pred_sized(type, 8, 16) \ - intra_pred_sized(type, 16, 8) \ - intra_pred_sized(type, 16, 32) \ - intra_pred_sized(type, 32, 16) \ - intra_pred_sized(type, 32, 64) \ - intra_pred_sized(type, 64, 32) \ - intra_pred_sized(type, 4, 16) \ - intra_pred_sized(type, 16, 4) \ - intra_pred_sized(type, 8, 32) \ - intra_pred_sized(type, 32, 8) \ - intra_pred_sized(type, 16, 64) \ - intra_pred_sized(type, 64, 16) \ - intra_pred_highbd_sized(type, 4, 8) \ - intra_pred_highbd_sized(type, 8, 4) \ - intra_pred_highbd_sized(type, 8, 16) \ - intra_pred_highbd_sized(type, 16, 8) \ - intra_pred_highbd_sized(type, 16, 32) \ - intra_pred_highbd_sized(type, 32, 16) \ - intra_pred_highbd_sized(type, 32, 64) \ - intra_pred_highbd_sized(type, 64, 32) \ - intra_pred_highbd_sized(type, 4, 16) \ - intra_pred_highbd_sized(type, 16, 4) \ - intra_pred_highbd_sized(type, 8, 32) \ - intra_pred_highbd_sized(type, 32, 8) \ - intra_pred_highbd_sized(type, 16, 64) \ - intra_pred_highbd_sized(type, 64, 16) -#define intra_pred_above_4x4(type) \ - intra_pred_sized(type, 8, 8) \ - intra_pred_sized(type, 16, 16) \ - 
intra_pred_sized(type, 32, 32) \ - intra_pred_sized(type, 64, 64) \ - intra_pred_highbd_sized(type, 4, 4) \ - intra_pred_highbd_sized(type, 8, 8) \ - intra_pred_highbd_sized(type, 16, 16) \ - intra_pred_highbd_sized(type, 32, 32) \ - intra_pred_highbd_sized(type, 64, 64) \ - intra_pred_rectangular(type) -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 4, 4) \ - intra_pred_above_4x4(type) -#define intra_pred_square(type) \ - intra_pred_sized(type, 4, 4) \ - intra_pred_sized(type, 8, 8) \ - intra_pred_sized(type, 16, 16) \ - intra_pred_sized(type, 32, 32) \ - intra_pred_sized(type, 64, 64) \ - intra_pred_highbd_sized(type, 4, 4) \ - intra_pred_highbd_sized(type, 8, 8) \ - intra_pred_highbd_sized(type, 16, 16) \ - intra_pred_highbd_sized(type, 32, 32) \ - intra_pred_highbd_sized(type, 64, 64) - -intra_pred_allsizes(v) -intra_pred_allsizes(h) -intra_pred_allsizes(smooth) -intra_pred_allsizes(smooth_v) -intra_pred_allsizes(smooth_h) -intra_pred_allsizes(paeth) -intra_pred_allsizes(dc_128) -intra_pred_allsizes(dc_left) -intra_pred_allsizes(dc_top) -intra_pred_square(dc) -/* clang-format on */ -#undef intra_pred_allsizes diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h deleted file mode 100644 index 3ec62a86e..000000000 --- a/third_party/aom/aom_dsp/intrapred_common.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_ -#define AOM_AOM_DSP_INTRAPRED_COMMON_H_ - -#include "config/aom_config.h" - -// Weights are quadratic from '1' to '1 / block_size', scaled by -// 2^sm_weight_log2_scale. -static const int sm_weight_log2_scale = 8; - -// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) -#define MAX_BLOCK_DIM 64 - -/* clang-format off */ -static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { - // Unused, because we always offset by bs, which is at least 2. - 0, 0, - // bs = 2 - 255, 128, - // bs = 4 - 255, 149, 85, 64, - // bs = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // bs = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // bs = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, - // bs = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, - 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, - 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, -}; -/* clang-format on */ - -#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_ diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c deleted file mode 100644 index a3f261824..000000000 --- a/third_party/aom/aom_dsp/loopfilter.c +++ /dev/null @@ -1,925 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -static INLINE int8_t signed_char_clamp(int t) { - return (int8_t)clamp(t, -128, 127); -} - -static INLINE int16_t signed_char_clamp_high(int t, int bd) { - switch (bd) { - case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); - case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); - case 8: - default: return (int16_t)clamp(t, -128, 128 - 1); - } -} - -// should we apply any filter at all: 11111111 yes, 00000000 no -static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, - uint8_t p0, uint8_t q0, uint8_t q1) { - int8_t mask = 0; - mask |= (abs(p1 - p0) > limit) * -1; - mask |= (abs(q1 - q0) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - return ~mask; -} - -static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, - uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, - uint8_t q1, uint8_t q2, uint8_t q3) { - int8_t mask = 0; - mask |= (abs(p3 - p2) > limit) * -1; - mask |= (abs(p2 - p1) > limit) * -1; - mask |= (abs(p1 - p0) > limit) * -1; - mask |= (abs(q1 - q0) > limit) * -1; - mask |= (abs(q2 - q1) > limit) * -1; - mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - return ~mask; -} - -static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, - uint8_t p2, uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, uint8_t q2) { - int8_t mask = 0; - mask |= (abs(p2 - p1) > limit) * -1; - mask |= (abs(p1 - p0) > limit) * -1; - mask |= (abs(q1 - q0) > limit) * -1; - mask |= (abs(q2 - q1) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - return ~mask; -} - -static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, - uint8_t p0, uint8_t q0, uint8_t q1, - uint8_t q2) { - int8_t mask = 0; - mask |= (abs(p1 - p0) > thresh) * -1; 
- mask |= (abs(q1 - q0) > thresh) * -1; - mask |= (abs(p2 - p0) > thresh) * -1; - mask |= (abs(q2 - q0) > thresh) * -1; - return ~mask; -} - -static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { - int8_t mask = 0; - mask |= (abs(p1 - p0) > thresh) * -1; - mask |= (abs(q1 - q0) > thresh) * -1; - mask |= (abs(p2 - p0) > thresh) * -1; - mask |= (abs(q2 - q0) > thresh) * -1; - mask |= (abs(p3 - p0) > thresh) * -1; - mask |= (abs(q3 - q0) > thresh) * -1; - return ~mask; -} - -// is there high edge variance internal edge: 11111111 yes, 00000000 no -static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1) { - int8_t hev = 0; - hev |= (abs(p1 - p0) > thresh) * -1; - hev |= (abs(q1 - q0) > thresh) * -1; - return hev; -} - -static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, - uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { - int8_t filter1, filter2; - - const int8_t ps1 = (int8_t)*op1 ^ 0x80; - const int8_t ps0 = (int8_t)*op0 ^ 0x80; - const int8_t qs0 = (int8_t)*oq0 ^ 0x80; - const int8_t qs1 = (int8_t)*oq1 ^ 0x80; - const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); - - // add outer taps if we have high edge variance - int8_t filter = signed_char_clamp(ps1 - qs1) & hev; - - // inner taps - filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; - - // save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way - filter1 = signed_char_clamp(filter + 4) >> 3; - filter2 = signed_char_clamp(filter + 3) >> 3; - - *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; - *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; - - // outer tap adjustments - filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - - *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; - *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; -} - -void 
aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < count; ++i) { - const uint8_t p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p]; - const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); - filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; - } -} - -void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. 
- for (i = 0; i < count; ++i) { - const uint8_t p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1]; - const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); - filter4(mask, *thresh, s - 2, s - 1, s, s + 1); - s += pitch; - } -} - -void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); - aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); -} - -static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, - uint8_t *op2, uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { - if (flat && mask) { - const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; - - // 5-tap filter [1, 2, 2, 2, 1] - *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); - *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); - *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); - *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); - } else { - filter4(mask, thresh, op1, op0, oq0, oq1); - } -} - -static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, - uint8_t *op3, uint8_t *op2, uint8_t *op1, - uint8_t *op0, uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3) { - if (flat && mask) { - const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - - // 7-tap filter [1, 1, 1, 2, 1, 1, 1] - *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); - *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); - *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); - *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); - *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); - *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + 
q1 + 2 * q2 + q3 + q3 + q3, 3); - } else { - filter4(mask, thresh, op1, op0, oq0, oq1); - } -} - -void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < count; ++i) { - const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; - - const int8_t mask = - filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); - const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); - filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, - s + 2 * p); - ++s; - } -} - -void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1); -} - -void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. 
- for (i = 0; i < count; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - - const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p); - ++s; - } -} - -void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - int count = 4; - - for (i = 0; i < count; ++i) { - const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2]; - const int8_t mask = - filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); - const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); - filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); - s += pitch; - } -} - -void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); - aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - int count = 4; - - for (i = 0; i < count; ++i) { - const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = 
- filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, - s + 3); - s += pitch; - } -} - -void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); - aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); -} - -static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, - int8_t flat2, uint8_t *op6, uint8_t *op5, - uint8_t *op4, uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, uint8_t *oq0, - uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, - uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) { - if (flat2 && flat && mask) { - const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, - p1 = *op1, p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, - q5 = *oq5, q6 = *oq6; - - // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] - *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, - 4); - *op4 = ROUND_POWER_OF_TWO( - p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); - *op3 = ROUND_POWER_OF_TWO( - p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); - *op2 = ROUND_POWER_OF_TWO( - p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, - 4); - *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + - q0 + q1 + q2 + q3 + q4, - 4); - *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + - q0 * 2 + q1 + q2 + q3 + q4 + q5, - 4); - *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + - q1 * 2 + q2 + q3 + q4 + q5 + q6, - 4); - *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + - q2 * 2 + q3 + q4 + q5 + q6 * 2, - 4); - *oq2 
= ROUND_POWER_OF_TWO( - p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, - 4); - *oq3 = ROUND_POWER_OF_TWO( - p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO( - p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, - 4); - } else { - filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); - } -} - -static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int count) { - int i; - int step = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < step * count; ++i) { - const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], - p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], - q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p]; - const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); - - filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, - s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); - ++s; - } -} - -void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); -} - -void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); - mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1); -} - 
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int count) { - int i; - - for (i = 0; i < count; ++i) { - const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3], - p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], - q5 = s[5], q6 = s[6]; - const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); - - filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, - s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); - s += p; - } -} - -void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); -} - -void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); - mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4); -} - -// Should we apply any filter at all: 11111111 yes, 00000000 no ? -static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, - uint16_t p1, uint16_t p0, uint16_t q0, - uint16_t q1, int bd) { - int8_t mask = 0; - int16_t limit16 = (uint16_t)limit << (bd - 8); - int16_t blimit16 = (uint16_t)blimit << (bd - 8); - mask |= (abs(p1 - p0) > limit16) * -1; - mask |= (abs(q1 - q0) > limit16) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; - return ~mask; -} - -// Should we apply any filter at all: 11111111 yes, 00000000 no ? 
-static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, - uint16_t p3, uint16_t p2, uint16_t p1, - uint16_t p0, uint16_t q0, uint16_t q1, - uint16_t q2, uint16_t q3, int bd) { - int8_t mask = 0; - int16_t limit16 = (uint16_t)limit << (bd - 8); - int16_t blimit16 = (uint16_t)blimit << (bd - 8); - mask |= (abs(p3 - p2) > limit16) * -1; - mask |= (abs(p2 - p1) > limit16) * -1; - mask |= (abs(p1 - p0) > limit16) * -1; - mask |= (abs(q1 - q0) > limit16) * -1; - mask |= (abs(q2 - q1) > limit16) * -1; - mask |= (abs(q3 - q2) > limit16) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; - return ~mask; -} - -static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, - uint16_t p2, uint16_t p1, - uint16_t p0, uint16_t q0, - uint16_t q1, uint16_t q2, - int bd) { - int8_t mask = 0; - int16_t limit16 = (uint16_t)limit << (bd - 8); - int16_t blimit16 = (uint16_t)blimit << (bd - 8); - mask |= (abs(p2 - p1) > limit16) * -1; - mask |= (abs(p1 - p0) > limit16) * -1; - mask |= (abs(q1 - q0) > limit16) * -1; - mask |= (abs(q2 - q1) > limit16) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; - return ~mask; -} - -static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, - uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, - uint16_t q2, int bd) { - int8_t mask = 0; - int16_t thresh16 = (uint16_t)thresh << (bd - 8); - mask |= (abs(p1 - p0) > thresh16) * -1; - mask |= (abs(q1 - q0) > thresh16) * -1; - mask |= (abs(p2 - p0) > thresh16) * -1; - mask |= (abs(q2 - q0) > thresh16) * -1; - return ~mask; -} - -static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, - uint16_t p1, uint16_t p0, uint16_t q0, - uint16_t q1, uint16_t q2, uint16_t q3, - int bd) { - int8_t mask = 0; - int16_t thresh16 = (uint16_t)thresh << (bd - 8); - mask |= (abs(p1 - p0) > thresh16) * -1; - mask |= (abs(q1 - q0) > thresh16) * -1; - mask |= (abs(p2 - p0) > thresh16) * -1; - mask |= (abs(q2 - q0) 
> thresh16) * -1; - mask |= (abs(p3 - p0) > thresh16) * -1; - mask |= (abs(q3 - q0) > thresh16) * -1; - return ~mask; -} - -// Is there high edge variance internal edge: -// 11111111_11111111 yes, 00000000_00000000 no ? -static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, int bd) { - int16_t hev = 0; - int16_t thresh16 = (uint16_t)thresh << (bd - 8); - hev |= (abs(p1 - p0) > thresh16) * -1; - hev |= (abs(q1 - q0) > thresh16) * -1; - return hev; -} - -static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, - uint16_t *op0, uint16_t *oq0, uint16_t *oq1, - int bd) { - int16_t filter1, filter2; - // ^0x80 equivalent to subtracting 0x80 from the values to turn them - // into -128 to +127 instead of 0 to 255. - int shift = bd - 8; - const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); - const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); - const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); - const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); - const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); - - // Add outer taps if we have high edge variance. - int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; - - // Inner taps. - filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; - - // Save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way. - filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; - filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; - - *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); - *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); - - // Outer tap adjustments. 
- filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - - *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); - *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); -} - -void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int bd) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < count; ++i) { - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const int8_t mask = - highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); - highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); - ++s; - } -} - -void aom_highbd_lpf_horizontal_4_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd); -} - -void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. 
- for (i = 0; i < count; ++i) { - const uint16_t p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1]; - const int8_t mask = - highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); - highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); - s += pitch; - } -} - -void aom_highbd_lpf_vertical_4_dual_c( - uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, - bd); -} - -static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, - uint16_t *op2, uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, - int bd) { - if (flat && mask) { - const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; - const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; - - // 5-tap filter [1, 2, 2, 2, 1] - *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); - *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); - *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); - *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); - } else { - highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); - } -} - -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, - uint16_t *op3, uint16_t *op2, uint16_t *op1, - uint16_t *op0, uint16_t *oq0, uint16_t *oq1, - uint16_t *oq2, uint16_t *oq3, int bd) { - if (flat && mask) { - const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - - // 7-tap filter [1, 1, 1, 2, 1, 1, 1] - *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); - *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); - *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); - *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 
+ q1 + q2 + q3, 3); - *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); - *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); - } else { - highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); - } -} - -void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. - for (i = 0; i < count; ++i) { - const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = - highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, - s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); - ++s; - } -} - -void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - int count = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. 
- for (i = 0; i < count; ++i) { - const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; - - const int8_t mask = - highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); - const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); - highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, bd); - ++s; - } -} - -void aom_highbd_lpf_horizontal_6_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd); -} - -void aom_highbd_lpf_horizontal_8_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd); -} - -void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - int count = 4; - - for (i = 0; i < count; ++i) { - const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2]; - const int8_t mask = - highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); - const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); - highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, - bd); - s += pitch; - } -} - -void aom_highbd_lpf_vertical_6_dual_c( - uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - 
aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, - bd); -} - -void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - int i; - int count = 4; - - for (i = 0; i < count; ++i) { - const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = - highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, - s + 2, s + 3, bd); - s += pitch; - } -} - -void aom_highbd_lpf_vertical_8_dual_c( - uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, - bd); -} - -static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, - int8_t flat2, uint16_t *op6, uint16_t *op5, - uint16_t *op4, uint16_t *op3, uint16_t *op2, - uint16_t *op1, uint16_t *op0, uint16_t *oq0, - uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, - uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, - int bd) { - if (flat2 && flat && mask) { - const uint16_t p6 = *op6; - const uint16_t p5 = *op5; - const uint16_t p4 = *op4; - const uint16_t p3 = *op3; - const uint16_t p2 = *op2; - const uint16_t p1 = *op1; - const uint16_t p0 = *op0; - const uint16_t q0 = *oq0; - const uint16_t q1 = *oq1; - const uint16_t q2 = *oq2; - const uint16_t q3 = *oq3; - const uint16_t q4 = *oq4; - const uint16_t q5 = *oq5; - const uint16_t q6 = *oq6; - - // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] - *op5 = 
ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, - 4); - *op4 = ROUND_POWER_OF_TWO( - p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); - *op3 = ROUND_POWER_OF_TWO( - p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); - *op2 = ROUND_POWER_OF_TWO( - p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, - 4); - *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + - q0 + q1 + q2 + q3 + q4, - 4); - *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + - q0 * 2 + q1 + q2 + q3 + q4 + q5, - 4); - *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + - q1 * 2 + q2 + q3 + q4 + q5 + q6, - 4); - *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + - q2 * 2 + q3 + q4 + q5 + q6 * 2, - 4); - *oq2 = ROUND_POWER_OF_TWO( - p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, - 4); - *oq3 = ROUND_POWER_OF_TWO( - p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO( - p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, - 4); - } else { - highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, - bd); - } -} - -static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int count, - int bd) { - int i; - int step = 4; - - // loop filter designed to work using chars so that we can make maximum use - // of 8 bit simd instructions. 
- for (i = 0; i < step * count; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = - highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - - const int8_t flat2 = - highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], - s[5 * p], s[6 * p], bd); - - highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, - s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); - ++s; - } -} - -void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); -} - -void aom_highbd_lpf_horizontal_14_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd); - highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd); -} - -static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int count, - int bd) { - int i; - - for (i = 0; i < count; ++i) { - const uint16_t p3 = s[-4]; - const uint16_t p2 = s[-3]; - const uint16_t p1 = s[-2]; - const uint16_t p0 = s[-1]; - const uint16_t q0 = s[0]; - const uint16_t q1 = s[1]; - const uint16_t q2 = s[2]; - const uint16_t q3 = s[3]; - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = - highbd_flat_mask4(1, p3, p2, p1, 
p0, q0, q1, q2, q3, bd); - const int8_t flat2 = - highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd); - - highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, - s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, - s + 6, bd); - s += p; - } -} - -void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); -} - -void aom_highbd_lpf_vertical_14_dual_c( - uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd); - highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, - 4, bd); -} diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c deleted file mode 100644 index 96d04cff0..000000000 --- a/third_party/aom/aom_dsp/mips/add_noise_msa.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "aom_dsp/mips/macros_msa.h" - -void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, - char blackclamp[16], char whiteclamp[16], - char bothclamp[16], uint32_t width, - uint32_t height, int32_t pitch) { - uint32_t i, j; - - for (i = 0; i < height / 2; ++i) { - uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; - int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff)); - uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; - int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff)); - for (j = width / 16; j--;) { - v16i8 temp00_s, temp01_s; - v16u8 temp00, temp01, black_clamp, white_clamp; - v16u8 pos0, ref0, pos1, ref1; - v16i8 const127 = __msa_ldi_b(127); - - pos0 = LD_UB(pos0_ptr); - ref0 = LD_UB(ref0_ptr); - pos1 = LD_UB(pos1_ptr); - ref1 = LD_UB(ref1_ptr); - black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); - white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); - temp00 = (pos0 < black_clamp); - pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); - temp01 = (pos1 < black_clamp); - pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); - XORI_B2_128_UB(pos0, pos1); - temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp00 = (v16u8)(temp00_s < pos0); - pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); - temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp01 = (temp01_s < pos1); - pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); - XORI_B2_128_UB(pos0, pos1); - pos0 += ref0; - ST_UB(pos0, pos0_ptr); - pos1 += ref1; - ST_UB(pos1, pos1_ptr); - pos0_ptr += 16; - pos1_ptr += 16; - ref0_ptr += 16; - ref1_ptr += 16; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c deleted file mode 100644 index 363fad308..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 mask0, mask1, mask2, mask3, out; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v8i16 filt, out0, out1; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out0, out1); - SRARI_H2_SH(out0, out1, FILTER_BITS); - SAT_SH2_SH(out0, out1, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 filt0, filt1, filt2, filt3; - v16i8 src0, src1, src2, src3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, 
src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out0, out1); - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - out = PCKEV_XORI128_UB(out2, out3); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, out0, out1, out2, - out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - tmp0 = PCKEV_XORI128_UB(out0, out1); - tmp1 = PCKEV_XORI128_UB(out2, out3); - 
ST8x4_UB(tmp0, tmp1, dst, dst_stride); -} - -static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - tmp0 = PCKEV_XORI128_UB(out0, out1); - tmp1 = PCKEV_XORI128_UB(out2, out3); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); - } -} - -static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - 
mask3 = mask0 + 6; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - LD_SB2(src, src_stride, src0, src2); - LD_SB2(src + 8, src_stride, src1, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (2 * src_stride); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - dst += dst_stride; - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst); - dst += dst_stride; - } -} - -static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 16); - dst += dst_stride; - - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, 
mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 16); - dst += dst_stride; - } -} - -static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - int32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 16); - - src0 = LD_SB(src + 32); - src2 = LD_SB(src + 48); - src3 = LD_SB(src + 56); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - out = PCKEV_XORI128_UB(out0, out1); - ST_UB(out, dst + 32); - out = PCKEV_XORI128_UB(out2, out3); - ST_UB(out, dst + 48); - dst += dst_stride; - } -} - -static void 
common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, vec0, vec1, res0, res1; - v8u16 vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); - SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 vec0, vec1, vec2, vec3, filt0; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16i8 res0, res1, res2, res3; - v8u16 vec4, vec5, vec6, vec7, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, - vec6, vec7); - SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_2t_4x8_msa(src, src_stride, 
dst, dst_stride, filter); - } -} - -static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 filt0; - v16i8 src0, src1, src2, src3, mask; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); - ST8x4_UB(src0, src1, dst, dst_stride); -} - -static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - v16u8 filt0; - v16i8 src0, src1, src2, src3, mask, out0, out1; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - 
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - if (16 == height) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); - } -} - -static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); - } -} - -static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - loop_cnt = (height >> 2) - 1; - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, 
src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - dst += dst_stride; - PCKEV_ST_SB(out2, out3, dst); - dst += dst_stride; - PCKEV_ST_SB(out4, out5, dst); - dst += dst_stride; - PCKEV_ST_SB(out6, out7, dst); - dst += dst_stride; - - for (; loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - dst += dst_stride; - PCKEV_ST_SB(out2, out3, dst); - dst += dst_stride; - PCKEV_ST_SB(out4, out5, dst); - dst += dst_stride; - PCKEV_ST_SB(out6, out7, dst); - dst += dst_stride; - } -} - -static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, 
mask; - v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = height >> 1; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - src4 = LD_SB(src); - src6 = LD_SB(src + 16); - src7 = LD_SB(src + 24); - src5 = __msa_sldi_b(src6, src4, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - PCKEV_ST_SB(out2, out3, dst + 16); - dst += dst_stride; - PCKEV_ST_SB(out4, out5, dst); - PCKEV_ST_SB(out6, out7, dst + 16); - dst += dst_stride; - } -} - -static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = height; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src4 = LD_SB(src + 32); - src6 = LD_SB(src + 48); - src7 = LD_SB(src + 56); - SLDI_B3_SB(src2, 
src4, src6, src0, src2, src4, src1, src3, src5, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_ST_SB(out0, out1, dst); - PCKEV_ST_SB(out2, out3, dst + 16); - PCKEV_ST_SB(out4, out5, dst + 32); - PCKEV_ST_SB(out6, out7, dst + 48); - dst += dst_stride; - } -} - -void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_hor[8]; - - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0) { - switch (w) { - case 4: - common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 8: - common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 16: - common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 32: - common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - case 64: - common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_hor[3], h); - break; - default: - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - 
common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 8: - common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 16: - common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 32: - common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - case 64: - common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_hor, h); - break; - default: - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c deleted file mode 100644 index aa962b41f..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c +++ /dev/null @@ -1,701 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; - v16i8 src10998, filt0, filt1, filt2, filt3; - v16u8 out; - v8i16 filt, out10, out32; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, - src4332, src6554); - XORI_B3_128_SB(src2110, src4332, src6554); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); - XORI_B2_128_SB(src8776, src10998); - out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, - filt1, filt2, filt3); - out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, - filt1, filt2, filt3); - SRARI_H2_SH(out10, out32, FILTER_BITS); - SAT_SH2_SH(out10, out32, 7); - out = PCKEV_XORI128_UB(out10, out32); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - - src2110 = src6554; - src4332 = src8776; - src6554 = src10998; - src6 = src10; - } -} - -static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t 
*dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; - v16u8 tmp0, tmp1; - v8i16 filt, out0_r, out1_r, out2_r, out3_r; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); - tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src6 = src10; - } -} - -static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t 
*filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt0, filt1, filt2, filt3; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; - v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; - v16u8 tmp0, tmp1, tmp2, tmp3; - v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, - src54_l, src21_l); - ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, - src87_l, src98_l, src109_l); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, - filt1, filt2, filt3); - out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, 
filt0, - filt1, filt2, filt3); - out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, - filt1, filt2, filt3); - out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); - PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, - tmp0, tmp1, tmp2, tmp3); - XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); - ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); - dst += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src10_l = src54_l; - src32_l = src76_l; - src54_l = src98_l; - src21_l = src65_l; - src43_l = src87_l; - src65_l = src109_l; - src6 = src10; - } -} - -static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height, - int32_t width) { - const uint8_t *src_tmp; - uint8_t *dst_tmp; - uint32_t loop_cnt, cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt0, filt1, filt2, filt3; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; - v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; - v16u8 tmp0, tmp1, tmp2, tmp3; - v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - for (cnt = (width >> 4); cnt--;) { - src_tmp = src; - dst_tmp = dst; - - LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src_tmp += (7 * src_stride); - ILVR_B4_SB(src1, src0, 
src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, - src54_l, src21_l); - ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src_tmp += (4 * src_stride); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, - src87_l, src98_l, src109_l); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, - filt1, filt2, filt3); - out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, - filt1, filt2, filt3); - out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, - filt1, filt2, filt3); - out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); - PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, - out3_r, tmp0, tmp1, tmp2, tmp3); - XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); - ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); - dst_tmp += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - 
src43_r = src87_r; - src65_r = src109_r; - src10_l = src54_l; - src32_l = src76_l; - src54_l = src98_l; - src21_l = src65_l; - src43_l = src87_l; - src65_l = src109_l; - src6 = src10; - } - - src += 16; - dst += 16; - } -} - -static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - 32); -} - -static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, - 64); -} - -static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4; - v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; - v16u8 filt0; - v8i16 filt; - v8u16 tmp0, tmp1; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - src += (5 * src_stride); - - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 filt; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB8(src, src_stride, src0, 
src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - src8 = LD_SB(src); - src += src_stride; - - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, - src76_r, src87_r); - ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, - src76_r, src2110, src4332, src6554, src8776); - DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); -} - -static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter) { - v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; - v16i8 out0, out1; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); - ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); -} - -static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t 
dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v16i8 out0, out1; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); - src += (8 * src_stride); - - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, - vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - src0 = src8; - } -} - -static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - if (4 == height) { - common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); - } -} - -static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = 
(v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - dst += dst_stride; - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst); - dst += dst_stride; - - src0 = src4; - } -} - -static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src5 = LD_UB(src + 16); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - - LD_UB4(src + 16, src_stride, src6, src7, src8, src9); - src += (4 * src_stride); - - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); 
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); - - ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); - ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 16); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); - - ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); - ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); - dst += (4 * dst_stride); - - src0 = src4; - src5 = src9; - } -} - -static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_UB4(src, 16, src0, src3, src6, src9); - src += src_stride; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - LD_UB2(src, src_stride, src1, src2); - LD_UB2(src + 16, src_stride, src4, src5); - 
LD_UB2(src + 32, src_stride, src7, src8); - LD_UB2(src + 48, src_stride, src10, src11); - src += (2 * src_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); - - ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); - ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_ST_SB(tmp4, tmp5, dst + 16); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); - - ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); - ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_ST_SB(tmp0, tmp1, dst + 32); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); - - ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); - ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_ST_SB(tmp4, tmp5, dst + 48); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); - dst += (2 * dst_stride); - - src0 = src2; - src3 = src5; - src6 = src8; - src9 = src11; - } -} - -void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_ver[8]; - - assert(y_step_q4 == 16); - 
assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 8; cnt--;) { - filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 8: - common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 16: - common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 32: - common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - case 64: - common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - &filt_ver[3], h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 8: - common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 16: - common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 32: - common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - case 64: - common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, - filt_ver, h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c deleted file mode 100644 index f7f116f4d..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <string.h> -#include "aom_dsp/mips/macros_msa.h" - -static void copy_width8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - out4 = __msa_copy_u_d((v2i64)src4, 0); - out5 = __msa_copy_u_d((v2i64)src5, 0); - out6 = __msa_copy_u_d((v2i64)src6, 0); - out7 = __msa_copy_u_d((v2i64)src7, 0); - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - for (cnt = height >> 3; cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = 
__msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - out4 = __msa_copy_u_d((v2i64)src4, 0); - out5 = __msa_copy_u_d((v2i64)src5, 0); - out6 = __msa_copy_u_d((v2i64)src6, 0); - out7 = __msa_copy_u_d((v2i64)src7, 0); - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 4) { - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - out2 = __msa_copy_u_d((v2i64)src2, 0); - out3 = __msa_copy_u_d((v2i64)src3, 0); - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 2) { - for (cnt = (height / 2); cnt--;) { - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - out0 = __msa_copy_u_d((v2i64)src0, 0); - out1 = __msa_copy_u_d((v2i64)src1, 0); - - SD(out0, dst); - dst += dst_stride; - SD(out1, dst); - dst += dst_stride; - } - } -} - -static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int32_t height, int32_t width) { - int32_t cnt, loop_cnt; - const uint8_t *src_tmp; - uint8_t *dst_tmp; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - for (cnt = (width >> 4); cnt--;) { - src_tmp = src; - dst_tmp = dst; - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6, - src7); - src_tmp += (8 * src_stride); - - ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp, - dst_stride); - dst_tmp += (8 * dst_stride); - } - - src += 16; - dst += 16; - } -} - -static void copy_width16_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - if 
(0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); - dst += (8 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); - } else if (0 == height % 4) { - for (cnt = (height >> 2); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); - } - } -} - -static void copy_width32_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - - if (0 == height % 12) { - for (cnt = (height / 12); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == height % 8) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); - } else if (0 == height % 4) { - for (cnt = (height 
>> 2); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(src + 16, src_stride, src4, src5, src6, src7); - src += (4 * src_stride); - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); - dst += (4 * dst_stride); - } - } -} - -static void copy_width64_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); -} - -void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - - switch (w) { - case 4: { - uint32_t cnt, tmp; - /* 1 word storage */ - for (cnt = h; cnt--;) { - tmp = LW(src); - SW(tmp, dst); - src += src_stride; - dst += dst_stride; - } - break; - } - case 8: { - copy_width8_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 16: { - copy_width16_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 32: { - copy_width32_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 64: { - copy_width64_msa(src, src_stride, dst, dst_stride, h); - break; - } - default: { - uint32_t cnt; - for (cnt = h; cnt--;) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h deleted file mode 100644 index 852415c20..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ -#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" -#include "aom_dsp/aom_filter.h" - -extern const uint8_t mc_filt_mask_arr[16 * 3]; - -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ - filt3) \ - ({ \ - v8i16 tmp_dpadd_0, tmp_dpadd_1; \ - \ - tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ - tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ - tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ - \ - tmp_dpadd_0; \ - }) - -#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, mask3, filt0, filt1, filt2, filt3, \ - out0, out1) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ - ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ - } - -#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, 
mask3, filt0, filt1, filt2, filt3, \ - out0, out1, out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ - res4_m, res5_m, res6_m, res7_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ - res4_m, res5_m, res6_m, res7_m); \ - ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ - res7_m, out0, out1, out2, out3); \ - } - -#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c deleted file mode 100644 index 00ab75dc3..000000000 --- a/third_party/aom/aom_dsp/mips/common_dspr2.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; -uint8_t *aom_ff_cropTbl; - -void aom_dsputil_static_init(void) { - int i; - - for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i; - - for (i = 0; i < CROP_WIDTH; i++) { - aom_ff_cropTbl_a[i] = 0; - aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; - } - - aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH]; -} - -#endif diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h deleted file mode 100644 index c42188d62..000000000 --- a/third_party/aom/aom_dsp/mips/common_dspr2.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif -#if HAVE_DSPR2 -#define CROP_WIDTH 512 - -extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c" - -static INLINE void prefetch_load(const unsigned char *src) { - __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); -} - -/* prefetch data for store */ -static INLINE void prefetch_store(unsigned char *dst) { - __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); -} - -static INLINE void prefetch_load_streamed(const unsigned char *src) { - __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); -} - -/* prefetch data for store */ -static INLINE void prefetch_store_streamed(unsigned char *dst) { - __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); -} -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c deleted file mode 100644 index 08bf1ab30..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c +++ /dev/null @@ -1,1031 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_horiz_4_transposed_dspr2( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - int32_t Temp1, Temp2; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - dst_ptr = dst; - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* odd 2. 
pixel */ - "lbux %[tp2], %[Temp2](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p1], %[Temp1](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[p1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[tp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[p2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [src] "r"(src), [dst_stride] "r"(dst_stride)); - - /* Next row... */ - src += src_stride; - dst += 1; - } -} - -static void convolve_bi_horiz_8_transposed_dspr2( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - uint32_t vector4a = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3; - uint32_t p1, p2, p3, p4; - uint8_t *odd_dst; - uint32_t dst_pitch_2 = (dst_stride << 1); - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - dst_ptr = dst; - odd_dst = (dst_ptr + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "balign %[tp3], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" - "lbux %[tp1], %[Temp3](%[cm]) \n\t" - "extp %[p3], $ac1, 31 \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[Temp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[Temp1], %[p3](%[cm]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "sb %[Temp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp1], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. 
pixel */ - "lbux %[tp3], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. pixel */ - "sb %[tp3], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[p1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[p2], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[p1], 0(%[odd_dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), - [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), - [odd_dst] "+r"(odd_dst) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); - - /* Next row... 
*/ - src += src_stride; - dst += 1; - } -} - -static void convolve_bi_horiz_16_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "ulw %[qload1], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload1] " - "\n\t" - "ulw %[qload2], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. 
pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. 
pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... 
*/ - src_ptr += src_stride; - dst_ptr += 1; - } -} - -static void convolve_bi_horiz_64_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "ulw %[qload1], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload1] " - "\n\t" - "ulw %[qload2], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. 
pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. 
pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... 
*/ - src_ptr += src_stride; - dst_ptr += 1; - } -} - -void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, int w, int h) { - int x, y; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - int sum = 0; - - sum += src[x] * filter[3]; - sum += src[x + 1] * filter[4]; - - dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - } - - src += src_stride; - dst += 1; - } -} - -void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter, int w, - int h) { - uint32_t pos = 38; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - - switch (w) { - case 4: - convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h); - break; - case 8: - convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h); - break; - case 16: - case 32: - convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h, (w / 16)); - break; - case 64: - prefetch_load(src + 32); - convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, - filter, h); - break; - default: - convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, - h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c deleted file mode 100644 index 097da73ca..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c +++ /dev/null @@ -1,681 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. 
pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p1], %[Temp2](%[cm]) \n\t" - "lbux %[p2], %[Temp4](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst]) \n\t" - "sb %[p1], 1(%[dst]) \n\t" - "sb %[tp2], 2(%[dst]) \n\t" - "sb %[p2], 3(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=&r"(Temp4) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3; - uint32_t p1, p2, p3, p4; - uint32_t st0, st1; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. 
pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[st0], 0(%[dst]) \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - - "balign %[tp3], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[st1], 2(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "sb %[st0], 4(%[dst]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. 
pixel */ - "sb %[st1], 1(%[dst]) \n\t" - "sb %[st0], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[p1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 3(%[dst]) \n\t" - "sb %[p2], 5(%[dst]) \n\t" - "sb %[p1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - uint32_t pos = 38; - - assert(x_step_q4 == 16); - - prefetch_load((const uint8_t *)filter_x); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 8: - convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 16: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 1); - break; - case 32: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 2); - break; - case 64: - prefetch_load(src + 64); - 
prefetch_store(dst + 32); - - convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - default: - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c deleted file mode 100644 index 40abfd89e..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "extp %[Temp1], $ac2, 
31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 
\n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - uint32_t pos = 38; - - assert(y_step_q4 == 16); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, - h); - break; - case 64: - prefetch_store(dst + 32); - convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c deleted file mode 100644 index af54b4264..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { - int x, y; - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: { - uint32_t tp1; - - /* 1 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], (%[src]) \n\t" - "sw %[tp1], (%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 8: { - uint32_t tp1, tp2; - - /* 2 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 16: { - uint32_t tp1, tp2, tp3, tp4; - - /* 4 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "ulw %[tp4], 12(%[src]) \n\t" - - "sw %[tp1], 0(%[dst]) \n\t" /* store 
*/ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 32: { - uint32_t tp1, tp2, tp3, tp4; - uint32_t tp5, tp6, tp7, tp8; - - /* 8 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "ulw %[tp4], 12(%[src]) \n\t" - "ulw %[tp5], 16(%[src]) \n\t" - "ulw %[tp6], 20(%[src]) \n\t" - "ulw %[tp7], 24(%[src]) \n\t" - "ulw %[tp8], 28(%[src]) \n\t" - - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), - [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - case 64: { - uint32_t tp1, tp2, tp3, tp4; - uint32_t tp5, tp6, tp7, tp8; - - prefetch_load(src + 64); - prefetch_store(dst + 32); - - /* 16 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_load(src + src_stride + 64); - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "ulw %[tp4], 12(%[src]) \n\t" - "ulw %[tp5], 16(%[src]) \n\t" - "ulw %[tp6], 20(%[src]) \n\t" - 
"ulw %[tp7], 24(%[src]) \n\t" - "ulw %[tp8], 28(%[src]) \n\t" - - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ - - "ulw %[tp1], 32(%[src]) \n\t" - "ulw %[tp2], 36(%[src]) \n\t" - "ulw %[tp3], 40(%[src]) \n\t" - "ulw %[tp4], 44(%[src]) \n\t" - "ulw %[tp5], 48(%[src]) \n\t" - "ulw %[tp6], 52(%[src]) \n\t" - "ulw %[tp7], 56(%[src]) \n\t" - "ulw %[tp8], 60(%[src]) \n\t" - - "sw %[tp1], 32(%[dst]) \n\t" /* store */ - "sw %[tp2], 36(%[dst]) \n\t" /* store */ - "sw %[tp3], 40(%[dst]) \n\t" /* store */ - "sw %[tp4], 44(%[dst]) \n\t" /* store */ - "sw %[tp5], 48(%[dst]) \n\t" /* store */ - "sw %[tp6], 52(%[dst]) \n\t" /* store */ - "sw %[tp7], 56(%[dst]) \n\t" /* store */ - "sw %[tp8], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), - [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - } break; - default: - for (y = h; y--;) { - for (x = 0; x < w; ++x) { - dst[x] = src[x]; - } - - src += src_stride; - dst += dst_stride; - } - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c deleted file mode 100644 index f9c6879ab..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c +++ /dev/null @@ -1,879 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4; - uint32_t n1, n2, n3, n4; - uint32_t tn1, tn2; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. 
pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[n1], %[tp2] \n\t" - "preceu.ph.qbl %[n2], %[tp2] \n\t" - "preceu.ph.qbr %[n3], %[tn2] \n\t" - "preceu.ph.qbl %[n4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[n1], %[tn1] \n\t" - "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[tn1], %[Temp2](%[cm]) \n\t" - "lbux %[n2], %[Temp4](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst]) \n\t" - "sb %[tn1], 1(%[dst]) \n\t" - "sb %[tp2], 2(%[dst]) \n\t" - "sb %[n2], 3(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), - [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4, n1; - uint32_t tn1, tn2, tn3; - uint32_t st0, st1; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "preceu.ph.qbl %[n1], %[tn2] \n\t" - "ulw %[tn1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. 
pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[tn1] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[st0], 0(%[dst]) \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - - "balign %[tn3], %[tn1], 3 \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[st1], 2(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tn2] \n\t" - "preceu.ph.qbl %[p4], %[tn2] \n\t" - "sb %[st0], 4(%[dst]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn1] \n\t" - "preceu.ph.qbl %[n1], %[tn1] \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. 
pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "preceu.ph.qbr %[p2], %[tn3] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. pixel */ - "sb %[st1], 1(%[dst]) \n\t" - "sb %[st0], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[n1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 3(%[dst]) \n\t" - "sb %[p2], 5(%[dst]) \n\t" - "sb %[n1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), - [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, - uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - 
- src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, - uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - - /* even 2. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - /* even 8. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - /* odd 6. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ - "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ - "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), - [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), - [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - 
- src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - if (((const int32_t *)filter_x)[0] == 0) { - aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - prefetch_load((const uint8_t *)filter_x); - src -= 3; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 8: - convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - case 16: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 1); - break; - case 32: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h, 2); - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filter_x, (int32_t)h); - break; - default: - aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c deleted file mode 100644 index 201e66427..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c +++ /dev/null @@ -1,361 +0,0 @@ -/* - * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo 
%[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] 
\n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] 
\n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 
*/ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... 
*/ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - if (((const int32_t *)filter_y)[0] == 0) { - aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); - break; - case 64: - prefetch_store(dst + 32); - convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); - break; - default: - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} - -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h deleted file mode 100644 index e5d48a884..000000000 --- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); - -void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter, int w, - int h); - -void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); - -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c deleted file mode 100644 index 7c221ae89..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; - - (void)above; - - __asm__ __volatile__( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "lb %[tmp5], 4(%[left]) \n\t" - "lb %[tmp6], 5(%[left]) \n\t" - "lb %[tmp7], 6(%[left]) \n\t" - "lb %[tmp8], 7(%[left]) \n\t" - "lb %[tmp9], 8(%[left]) \n\t" - "lb %[tmp10], 9(%[left]) \n\t" - "lb %[tmp11], 10(%[left]) \n\t" - "lb %[tmp12], 11(%[left]) \n\t" - "lb %[tmp13], 12(%[left]) \n\t" - "lb %[tmp14], 13(%[left]) \n\t" - "lb %[tmp15], 14(%[left]) \n\t" - "lb %[tmp16], 15(%[left]) \n\t" - - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "replv.qb %[tmp5], %[tmp5] \n\t" - "replv.qb %[tmp6], %[tmp6] \n\t" - "replv.qb %[tmp7], %[tmp7] \n\t" - "replv.qb %[tmp8], %[tmp8] \n\t" - "replv.qb %[tmp9], %[tmp9] \n\t" - "replv.qb %[tmp10], %[tmp10] \n\t" - "replv.qb %[tmp11], %[tmp11] \n\t" - "replv.qb %[tmp12], %[tmp12] \n\t" - "replv.qb %[tmp13], %[tmp13] \n\t" - "replv.qb %[tmp14], %[tmp14] \n\t" - "replv.qb %[tmp15], %[tmp15] \n\t" - "replv.qb %[tmp16], %[tmp16] \n\t" - - "sw %[tmp1], (%[dst]) \n\t" - "sw %[tmp1], 4(%[dst]) \n\t" - "sw %[tmp1], 8(%[dst]) \n\t" - "sw %[tmp1], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "sw %[tmp2], 4(%[dst]) \n\t" - "sw %[tmp2], 8(%[dst]) \n\t" - "sw %[tmp2], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "sw %[tmp3], 4(%[dst]) \n\t" - "sw %[tmp3], 8(%[dst]) \n\t" - "sw %[tmp3], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - "sw %[tmp4], 4(%[dst]) 
\n\t" - "sw %[tmp4], 8(%[dst]) \n\t" - "sw %[tmp4], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp5], (%[dst]) \n\t" - "sw %[tmp5], 4(%[dst]) \n\t" - "sw %[tmp5], 8(%[dst]) \n\t" - "sw %[tmp5], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp6], (%[dst]) \n\t" - "sw %[tmp6], 4(%[dst]) \n\t" - "sw %[tmp6], 8(%[dst]) \n\t" - "sw %[tmp6], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp7], (%[dst]) \n\t" - "sw %[tmp7], 4(%[dst]) \n\t" - "sw %[tmp7], 8(%[dst]) \n\t" - "sw %[tmp7], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp8], (%[dst]) \n\t" - "sw %[tmp8], 4(%[dst]) \n\t" - "sw %[tmp8], 8(%[dst]) \n\t" - "sw %[tmp8], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp9], (%[dst]) \n\t" - "sw %[tmp9], 4(%[dst]) \n\t" - "sw %[tmp9], 8(%[dst]) \n\t" - "sw %[tmp9], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp10], (%[dst]) \n\t" - "sw %[tmp10], 4(%[dst]) \n\t" - "sw %[tmp10], 8(%[dst]) \n\t" - "sw %[tmp10], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp11], (%[dst]) \n\t" - "sw %[tmp11], 4(%[dst]) \n\t" - "sw %[tmp11], 8(%[dst]) \n\t" - "sw %[tmp11], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp12], (%[dst]) \n\t" - "sw %[tmp12], 4(%[dst]) \n\t" - "sw %[tmp12], 8(%[dst]) \n\t" - "sw %[tmp12], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp13], (%[dst]) \n\t" - "sw %[tmp13], 4(%[dst]) \n\t" - "sw %[tmp13], 8(%[dst]) \n\t" - "sw %[tmp13], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp14], (%[dst]) \n\t" - "sw %[tmp14], 4(%[dst]) \n\t" - "sw %[tmp14], 8(%[dst]) \n\t" - "sw %[tmp14], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp15], (%[dst]) \n\t" - "sw %[tmp15], 4(%[dst]) \n\t" - "sw %[tmp15], 8(%[dst]) \n\t" - "sw %[tmp15], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp16], (%[dst]) \n\t" - "sw %[tmp16], 
4(%[dst]) \n\t" - "sw %[tmp16], 8(%[dst]) \n\t" - "sw %[tmp16], 12(%[dst]) \n\t" - - : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), - [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), - [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), - [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), - [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), - [tmp16] "=&r"(tmp16) - : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); -} - -void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, left2; - - __asm__ __volatile__( - "lw %[above1], (%[above]) \n\t" - "lw %[above2], 4(%[above]) \n\t" - "lw %[left1], (%[left]) \n\t" - "lw %[left2], 4(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "addu.ph %[average], %[above_r1], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "preceu.ph.qbl %[above_l1], %[above2] \n\t" - "preceu.ph.qbr %[above_r1], %[above2] \n\t" - "preceu.ph.qbl %[left_l1], %[left2] \n\t" - "preceu.ph.qbr %[left_r1], %[left2] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "lw %[above1], 8(%[above]) \n\t" - "lw %[above2], 12(%[above]) \n\t" - "lw %[left1], 8(%[left]) \n\t" - "lw %[left2], 12(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "addu.ph %[average], %[average], 
%[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "preceu.ph.qbl %[above_l1], %[above2] \n\t" - "preceu.ph.qbr %[above_r1], %[above2] \n\t" - "preceu.ph.qbl %[left_l1], %[left2] \n\t" - "preceu.ph.qbr %[left_r1], %[left2] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "addiu %[average], %[average], 16 \n\t" - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 5 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw 
%[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), - [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), - [above_r1] 
"=&r"(above_r1), [above2] "=&r"(above2), - [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), - [expected_dc] "=&r"(expected_dc) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride)); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c deleted file mode 100644 index 0a21979c7..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4; - (void)above; - - __asm__ __volatile__( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "sw %[tmp1], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - - : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), - [tmp4] "=&r"(tmp4) - : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); -} - -void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; - - __asm__ __volatile__( - "lw %[above_c], (%[above]) \n\t" - "lw %[left_c], (%[left]) \n\t" - - "preceu.ph.qbl %[above_l], %[above_c] \n\t" - "preceu.ph.qbr %[above_r], %[above_c] \n\t" - "preceu.ph.qbl %[left_l], %[left_c] \n\t" - "preceu.ph.qbr %[left_r], %[left_c] \n\t" - - "addu.ph %[average], %[above_r], %[above_l] \n\t" - "addu.ph %[average], %[average], %[left_l] \n\t" - "addu.ph %[average], %[average], %[left_r] \n\t" - "addiu %[average], %[average], 4 \n\t" - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 3 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], 
%[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - - : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), - [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), - [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), - [average] "=&r"(average), [tmp] "=&r"(tmp), - [expected_dc] "=&r"(expected_dc) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride)); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c deleted file mode 100644 index d42a77c80..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/common_dspr2.h" - -#if HAVE_DSPR2 -void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - (void)above; - - __asm__ __volatile__( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "lb %[tmp5], 4(%[left]) \n\t" - "lb %[tmp6], 5(%[left]) \n\t" - "lb %[tmp7], 6(%[left]) \n\t" - "lb %[tmp8], 7(%[left]) \n\t" - - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "replv.qb %[tmp5], %[tmp5] \n\t" - "replv.qb %[tmp6], %[tmp6] \n\t" - "replv.qb %[tmp7], %[tmp7] \n\t" - "replv.qb %[tmp8], %[tmp8] \n\t" - - "sw %[tmp1], (%[dst]) \n\t" - "sw %[tmp1], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "sw %[tmp2], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "sw %[tmp3], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - "sw %[tmp4], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp5], (%[dst]) \n\t" - "sw %[tmp5], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp6], (%[dst]) \n\t" - "sw %[tmp6], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp7], (%[dst]) \n\t" - "sw %[tmp7], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp8], (%[dst]) \n\t" - "sw %[tmp8], 4(%[dst]) \n\t" - - : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), - [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), - [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) - : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); -} - -void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, 
above_r1, left1, left_r1, left_l1; - int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; - - __asm__ __volatile__( - "lw %[above1], (%[above]) \n\t" - "lw %[above2], 4(%[above]) \n\t" - "lw %[left1], (%[left]) \n\t" - "lw %[left2], 4(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "preceu.ph.qbl %[above_l2], %[above2] \n\t" - "preceu.ph.qbr %[above_r2], %[above2] \n\t" - "preceu.ph.qbl %[left_l2], %[left2] \n\t" - "preceu.ph.qbr %[left_r2], %[left2] \n\t" - - "addu.ph %[average], %[above_r1], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "addu.ph %[average], %[average], %[above_l2] \n\t" - "addu.ph %[average], %[average], %[above_r2] \n\t" - "addu.ph %[average], %[average], %[left_l2] \n\t" - "addu.ph %[average], %[average], %[left_r2] \n\t" - - "addiu %[average], %[average], 8 \n\t" - - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 4 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add 
%[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), - [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), - [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), - [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), - [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), - [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), - [average] "=&r"(average), [tmp] "=&r"(tmp), - [expected_dc] "=&r"(expected_dc) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride)); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c deleted file mode 100644 index 9f25cc1ca..000000000 --- a/third_party/aom/aom_dsp/mips/intrapred_msa.c +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ - { \ - out0 = __msa_subs_u_h(out0, in0); \ - out1 = __msa_subs_u_h(out1, in1); \ - } - -static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t src_data; - - src_data = LW(src); - - SW4(src_data, src_data, src_data, src_data, dst, dst_stride); -} - -static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint32_t src_data1, src_data2; - - src_data1 = LW(src); - src_data2 = LW(src + 4); - - for (row = 8; row--;) { - SW(src_data1, dst); - SW(src_data2, (dst + 4)); - dst += dst_stride; - } -} - -static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 src0; - - src0 = LD_UB(src); - - for (row = 16; row--;) { - ST_UB(src0, dst); - dst += dst_stride; - } -} - -static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 src1, src2; - - src1 = LD_UB(src); - src2 = LD_UB(src + 16); - - for (row = 32; row--;) { - ST_UB2(src1, src2, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t out0, out1, out2, out3; - - out0 = src[0] * 0x01010101; - out1 = src[1] * 0x01010101; - out2 = src[2] * 0x01010101; - out3 = src[3] * 0x01010101; - - SW4(out0, out1, out2, out3, dst, dst_stride); -} - -static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - out0 = src[0] * 0x0101010101010101ull; - out1 = src[1] * 0x0101010101010101ull; - out2 = src[2] * 0x0101010101010101ull; - out3 = src[3] * 0x0101010101010101ull; - out4 = src[4] * 0x0101010101010101ull; - out5 = src[5] * 0x0101010101010101ull; - out6 = src[6] * 
0x0101010101010101ull; - out7 = src[7] * 0x0101010101010101ull; - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); -} - -static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint8_t inp0, inp1, inp2, inp3; - v16u8 src0, src1, src2, src3; - - for (row = 4; row--;) { - inp0 = src[0]; - inp1 = src[1]; - inp2 = src[2]; - inp3 = src[3]; - src += 4; - - src0 = (v16u8)__msa_fill_b(inp0); - src1 = (v16u8)__msa_fill_b(inp1); - src2 = (v16u8)__msa_fill_b(inp2); - src3 = (v16u8)__msa_fill_b(inp3); - - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint8_t inp0, inp1, inp2, inp3; - v16u8 src0, src1, src2, src3; - - for (row = 8; row--;) { - inp0 = src[0]; - inp1 = src[1]; - inp2 = src[2]; - inp3 = src[3]; - src += 4; - - src0 = (v16u8)__msa_fill_b(inp0); - src1 = (v16u8)__msa_fill_b(inp1); - src2 = (v16u8)__msa_fill_b(inp2); - src3 = (v16u8)__msa_fill_b(inp3); - - ST_UB2(src0, src0, dst, 16); - dst += dst_stride; - ST_UB2(src1, src1, dst, 16); - dst += dst_stride; - ST_UB2(src2, src2, dst, 16); - dst += dst_stride; - ST_UB2(src3, src3, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_dc_4x4_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint32_t val0, val1; - v16i8 store, src = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LW(src_top); - val1 = LW(src_left); - INSERT_W2_SB(val0, val1, src); - sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_w((v4i32)store, 0); - - SW4(val0, val0, val0, val0, dst, dst_stride); -} - 
-static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t val0; - v16i8 store, data = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - - val0 = LW(src); - data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); - sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_w((v4i32)store, 0); - - SW4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { - uint32_t out; - const v16i8 store = __msa_ldi_b(128); - - out = __msa_copy_u_w((v4i32)store, 0); - - SW4(out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_8x8_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint64_t val0, val1; - v16i8 store; - v16u8 src = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LD(src_top); - val1 = LD(src_left); - INSERT_D2_UB(val0, val1, src); - sum_h = __msa_hadd_u_h(src, src); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_d((v2i64)store, 0); - - SD4(val0, val0, val0, val0, dst, dst_stride); - dst += (4 * dst_stride); - SD4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint64_t val0; - v16i8 store; - v16u8 data = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LD(src); - data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); - sum_h = __msa_hadd_u_h(data, data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); - store = 
__msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_d((v2i64)store, 0); - - SD4(val0, val0, val0, val0, dst, dst_stride); - dst += (4 * dst_stride); - SD4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { - uint64_t out; - const v16i8 store = __msa_ldi_b(128); - - out = __msa_copy_u_d((v2i64)store, 0); - - SD4(out, out, out, out, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_16x16_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - v16u8 top, left, out; - v8u16 sum_h, sum_top, sum_left; - v4u32 sum_w; - v2u64 sum_d; - - top = LD_UB(src_top); - left = LD_UB(src_left); - HADD_UB2_UH(top, left, sum_top, sum_left); - sum_h = sum_top + sum_left; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - v16u8 data, out; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - data = LD_UB(src); - sum_h = __msa_hadd_u_h(data, data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void 
intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { - const v16u8 out = (v16u8)__msa_ldi_b(128); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 top0, top1, left0, left1, out; - v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; - v4u32 sum_w; - v2u64 sum_d; - - LD_UB2(src_top, 16, top0, top1); - LD_UB2(src_left, 16, left0, left1); - HADD_UB2_UH(top0, top1, sum_top0, sum_top1); - HADD_UB2_UH(left0, left1, sum_left0, sum_left1); - sum_h = sum_top0 + sum_top1; - sum_h += sum_left0 + sum_left1; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 data0, data1, out; - v8u16 sum_h, sum_data0, sum_data1; - v4u32 sum_w; - v2u64 sum_d; - - LD_UB2(src, 16, data0, data1); - HADD_UB2_UH(data0, data1, sum_data0, sum_data1); - sum_h = sum_data0 + sum_data1; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_128dc_32x32_msa(uint8_t 
*dst, int32_t dst_stride) { - uint32_t row; - const v16u8 out = (v16u8)__msa_ldi_b(128); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_4x4_msa(above, dst, y_stride); -} - -void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_8x8_msa(above, dst, y_stride); -} - -void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_16x16_msa(above, dst, y_stride); -} - -void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_32x32_msa(above, dst, y_stride); -} - -void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_4x4_msa(left, dst, y_stride); -} - -void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_8x8_msa(left, dst, y_stride); -} - -void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_16x16_msa(left, dst, y_stride); -} - -void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_32x32_msa(left, dst, y_stride); -} - -void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_4x4_msa(above, left, dst, y_stride); -} - -void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t 
*left) { - intra_predict_dc_8x8_msa(above, left, dst, y_stride); -} - -void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_16x16_msa(above, left, dst, y_stride); -} - -void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_32x32_msa(above, left, dst, y_stride); -} - -void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_4x4_msa(above, dst, y_stride); -} - -void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_8x8_msa(above, dst, y_stride); -} - -void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_16x16_msa(above, dst, y_stride); -} - -void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_32x32_msa(above, dst, y_stride); -} - -void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_4x4_msa(left, dst, y_stride); -} - -void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_8x8_msa(left, dst, y_stride); -} - -void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_16x16_msa(left, dst, y_stride); -} - -void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_32x32_msa(left, dst, y_stride); -} - -void 
aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_4x4_msa(dst, y_stride); -} - -void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_8x8_msa(dst, y_stride); -} - -void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_16x16_msa(dst, y_stride); -} - -void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_32x32_msa(dst, y_stride); -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c deleted file mode 100644 index 38a10e9b2..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c +++ /dev/null @@ -1,1488 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_ports/mem.h" -#include "aom_dsp/mips/loopfilter_msa.h" - -int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16u8 zero = { 0 }; - - /* load vector elements */ - LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); - - return 1; - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - 
p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); - filter48 += (4 * 16); - ST_UB2(q1_out, q2_out, filter48, 16); - filter48 += (2 * 16); - ST_UB(flat, filter48); - - return 0; - } -} - -void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { - v16u8 flat, flat2, filter8; - v16i8 zero = { 0 }; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; - v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; - v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; - v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; - v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; - v8i16 l_out, r_out; - - flat = LD_UB(filter48 + 96); - - LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - LD_UB4(filter48, 16, p2, p1, p0, q0); - LD_UB2(filter48 + 4 * 16, 16, q1, q2); - - src -= 3 * pitch; - ST_UB4(p2, p1, p0, q0, src, pitch); - src += (4 * pitch); - ST_UB2(q1, q2, src, pitch); - } else { - src -= 7 * pitch; - - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, - p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, - p2_r_in, p1_r_in, p0_r_in); - - q0_r_in = 
(v8u16)__msa_ilvr_b(zero, (v16i8)q0); - - tmp0_r = p7_r_in << 3; - tmp0_r -= p7_r_in; - tmp0_r += p6_r_in; - tmp0_r += q0_r_in; - tmp1_r = p6_r_in + p5_r_in; - tmp1_r += p4_r_in; - tmp1_r += p3_r_in; - tmp1_r += p2_r_in; - tmp1_r += p1_r_in; - tmp1_r += p0_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, - p5_l_in, p4_l_in); - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, - p1_l_in, p0_l_in); - q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); - - tmp0_l = p7_l_in << 3; - tmp0_l -= p7_l_in; - tmp0_l += p6_l_in; - tmp0_l += q0_l_in; - tmp1_l = p6_l_in + p5_l_in; - tmp1_l += p4_l_in; - tmp1_l += p3_l_in; - tmp1_l += p2_l_in; - tmp1_l += p1_l_in; - tmp1_l += p0_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); - ST_UB(p6, src); - src += pitch; - - /* p5 */ - q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); - tmp0_r = p5_r_in - p6_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); - tmp0_l = p5_l_in - p6_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); - ST_UB(p5, src); - src += pitch; - - /* p4 */ - q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); - tmp0_r = p4_r_in - p5_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); - - q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); - tmp0_l = p4_l_in - p5_l_in; - tmp0_l += q2_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p4 = __msa_bmnz_v(p4, 
(v16u8)r_out, flat2); - ST_UB(p4, src); - src += pitch; - - /* p3 */ - q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); - tmp0_r = p3_r_in - p4_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); - tmp0_l = p3_l_in - p4_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); - ST_UB(p3, src); - src += pitch; - - /* p2 */ - q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); - filter8 = LD_UB(filter48); - tmp0_r = p2_r_in - p3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); - tmp0_l = p2_l_in - p3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* p1 */ - q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); - filter8 = LD_UB(filter48 + 16); - tmp0_r = p1_r_in - p2_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); - tmp0_l = p1_l_in - p2_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* p0 */ - q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); - filter8 = LD_UB(filter48 + 32); - tmp0_r = p0_r_in - p1_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q6_l_in = 
(v8u16)__msa_ilvl_b(zero, (v16i8)q6); - tmp0_l = p0_l_in - p1_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q0 */ - q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); - filter8 = LD_UB(filter48 + 48); - tmp0_r = q7_r_in - p0_r_in; - tmp0_r += q0_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); - tmp0_l = q7_l_in - p0_l_in; - tmp0_l += q0_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q1 */ - filter8 = LD_UB(filter48 + 64); - tmp0_r = q7_r_in - q0_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p6_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q0_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p6_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q2 */ - filter8 = LD_UB(filter48 + 80); - tmp0_r = q7_r_in - q1_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p5_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q1_l_in; - tmp0_l += q2_l_in; - tmp0_l -= p5_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += pitch; - - /* q3 */ - tmp0_r = q7_r_in - q2_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p4_r_in; - tmp1_r += tmp0_r; - r_out = 
__msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q2_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p4_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); - ST_UB(q3, src); - src += pitch; - - /* q4 */ - tmp0_r = q7_r_in - q3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p3_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p3_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); - ST_UB(q4, src); - src += pitch; - - /* q5 */ - tmp0_r = q7_r_in - q4_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p2_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q4_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p2_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); - ST_UB(q5, src); - src += pitch; - - /* q6 */ - tmp0_r = q7_r_in - q5_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p1_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - tmp0_l = q7_l_in - q5_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p1_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); - ST_UB(q6, src); - } -} - -static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { - DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); - uint8_t early_exit = 0; - - (void)count; - - early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, - limit_ptr, thresh_ptr); - - if (0 == early_exit) { - 
aom_hz_lpf_t16_16w(src, pitch, filter48); - } -} - -static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, int32_t count) { - if (1 == count) { - uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; - uint64_t dword0, dword1; - v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 p0_filter16, p1_filter16; - v8i16 p2_filter8, p1_filter8, p0_filter8; - v8i16 q0_filter8, q1_filter8, q2_filter8; - v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; - v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; - v16i8 zero = { 0 }; - v8u16 tmp0, tmp1, tmp2; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, - q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); - } else { - /* convert 8 bit input data into 16 bit */ - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, - p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, 
p0_filter8, zero, - q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); - PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); - - /* load 16 vector elements */ - LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); - LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); - - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - p2_d = __msa_copy_u_d((v2i64)p2_out, 0); - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - q2_d = __msa_copy_u_d((v2i64)q2_out, 0); - - SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); - SD(q1_d, src + pitch); - SD(q2_d, src + 2 * pitch); - } else { - /* LSB(right) 8 pixel operation */ - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, - zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, - q7_r); - - tmp0 = p7_r << 3; - tmp0 -= p7_r; - tmp0 += p6_r; - tmp0 += q0_r; - - src -= 7 * pitch; - - /* calculation of p6 and p5 */ - tmp1 = p6_r + p5_r + p4_r + p3_r; - tmp1 += (p2_r + p1_r + p0_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp0 = p5_r - p6_r + q1_r - p7_r; - tmp1 += tmp0; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - 
SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of p4 and p3 */ - tmp0 = p4_r - p5_r + q2_r - p7_r; - tmp2 = p3_r - p4_r + q3_r - p7_r; - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of p2 and p1 */ - tmp0 = p2_r - p3_r + q4_r - p7_r; - tmp2 = p1_r - p2_r + q5_r - p7_r; - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of p0 and q0 */ - tmp0 = (p0_r - p1_r) + (q6_r - p7_r); - tmp2 = (q7_r - p0_r) + (q0_r - p7_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of q1 and q2 */ - tmp0 = q7_r - q0_r + q1_r - p6_r; - tmp2 = q7_r - 
q1_r + q2_r - p5_r; - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of q3 and q4 */ - tmp0 = (q7_r - q2_r) + (q3_r - p4_r); - tmp2 = (q7_r - q3_r) + (q4_r - p3_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - src += pitch; - - /* calculation of q5 and q6 */ - tmp0 = (q7_r - q4_r) + (q5_r - p2_r); - tmp2 = (q7_r - q5_r) + (q6_r - p1_r); - tmp1 += tmp0; - p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - tmp1 += tmp2; - p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); - PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, - p1_filter16); - p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); - p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); - dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); - dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); - SD(dword0, src); - src += pitch; - SD(dword1, src); - } - } - } else { - mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, - count); - } -} - -void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t 
*thresh_ptr) { - mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); -} - -void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); -} - -static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, - uint8_t *output, int32_t out_pitch) { - v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; - v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - - LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, - p1_org, p0_org); - /* 8x8 transpose */ - TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, - p0_org, p7, p6, p5, p4, p3, p2, p1, p0); - /* 8x8 transpose */ - ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, - tmp0, tmp1, tmp2, tmp3); - ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); - ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); - ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); - ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); - SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); - - ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); - output += (8 * out_pitch); - ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); -} - -static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, - uint8_t *output, int32_t out_pitch) { - v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - - LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); - TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, - q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); - ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); -} 
- -static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, - int32_t out_pitch) { - v16u8 row0, row1, row2, row3, row4, row5, row6, row7; - v16u8 row8, row9, row10, row11, row12, row13, row14, row15; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; - v4i32 tmp2, tmp3; - - LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); - input += (8 * in_pitch); - LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); - - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, - row9, row10, row11, row12, row13, row14, row15, p7, p6, - p5, p4, p3, p2, p1, p0); - - /* transpose 16x8 matrix into 8x16 */ - /* total 8 intermediate register and 32 instructions */ - q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0); - q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1); - q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2); - q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3); - q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4); - q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5); - q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6); - q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7); - - ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); - tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7); - tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5); - - ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); - tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3); - tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1); - - ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); - q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2); - - tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); - tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); - q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); - - ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); - q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); - 
- tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); - tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); - q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); - q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); - - ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); - output += (8 * out_pitch); - ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); -} - -int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch_org, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v16i8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3; - - /* load vector elements */ - LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); - return 1; - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - /* convert 16 bit output data into 8 bit */ - p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, 
(v16i8)p2_filt8_r); - p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); - p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); - q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); - q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); - q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); - - ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); - filter48 += (4 * 16); - ST_UB2(q1_out, q2_out, filter48, 16); - filter48 += (2 * 16); - ST_UB(flat, filter48); - - return 0; - } -} - -int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { - v16i8 zero = { 0 }; - v16u8 filter8, flat, flat2; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; - v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; - v8u16 tmp0_r, tmp1_r; - v8i16 r_out; - - flat = LD_UB(filter48 + 6 * 16); - - LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); - - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - v8i16 vec0, vec1, vec2, vec3, vec4; - - LD_UB4(filter48, 16, p2, p1, p0, q0); - LD_UB2(filter48 + 4 * 16, 16, q1, q2); - - ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec3, vec4); - vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); - - src_org -= 3; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec2, 0, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec4, vec4, 0, 1, 2, 3, 
src_org, pitch); - ST2x4_UB(vec2, 4, (src_org + 4), pitch); - - return 1; - } else { - src -= 7 * 16; - - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, - p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, - p2_r_in, p1_r_in, p0_r_in); - q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); - - tmp0_r = p7_r_in << 3; - tmp0_r -= p7_r_in; - tmp0_r += p6_r_in; - tmp0_r += q0_r_in; - tmp1_r = p6_r_in + p5_r_in; - tmp1_r += p4_r_in; - tmp1_r += p3_r_in; - tmp1_r += p2_r_in; - tmp1_r += p1_r_in; - tmp1_r += p0_r_in; - tmp1_r += tmp0_r; - - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); - ST8x1_UB(p6, src); - src += 16; - - /* p5 */ - q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); - tmp0_r = p5_r_in - p6_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); - ST8x1_UB(p5, src); - src += 16; - - /* p4 */ - q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); - tmp0_r = p4_r_in - p5_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); - ST8x1_UB(p4, src); - src += 16; - - /* p3 */ - q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); - tmp0_r = p3_r_in - p4_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); - ST8x1_UB(p3, src); - src += 16; - - /* p2 */ - q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); - filter8 = LD_UB(filter48); - tmp0_r = p2_r_in - p3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - 
r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* p1 */ - q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); - filter8 = LD_UB(filter48 + 16); - tmp0_r = p1_r_in - p2_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* p0 */ - q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); - filter8 = LD_UB(filter48 + 32); - tmp0_r = p0_r_in - p1_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q0 */ - q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); - filter8 = LD_UB(filter48 + 48); - tmp0_r = q7_r_in - p0_r_in; - tmp0_r += q0_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q1 */ - filter8 = LD_UB(filter48 + 64); - tmp0_r = q7_r_in - q0_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p6_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q2 */ - filter8 = LD_UB(filter48 + 80); - tmp0_r = q7_r_in - q1_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p5_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST8x1_UB(filter8, src); - src += 16; - - /* q3 
*/ - tmp0_r = q7_r_in - q2_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p4_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); - ST8x1_UB(q3, src); - src += 16; - - /* q4 */ - tmp0_r = q7_r_in - q3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p3_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); - ST8x1_UB(q4, src); - src += 16; - - /* q5 */ - tmp0_r = q7_r_in - q4_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p2_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); - ST8x1_UB(q5, src); - src += 16; - - /* q6 */ - tmp0_r = q7_r_in - q5_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p1_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); - q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); - ST8x1_UB(q6, src); - - return 0; - } -} - -void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint8_t early_exit = 0; - DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); - uint8_t *filter48 = &transposed_input[16 * 16]; - - transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); - - early_exit = - aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); - - if (0 == early_exit) { - early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); - - if (0 == early_exit) { - transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); - } - } -} - -int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch, - const uint8_t *b_limit_ptr, - const 
uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16i8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - - /* load vector elements */ - LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec4, vec5); - - src_org -= 2; - ST4x8_UB(vec2, vec3, src_org, pitch); - src_org += 8 * pitch; - ST4x8_UB(vec4, vec5, src_org, pitch); - - return 1; - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 
bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); - filter48 += (4 * 16); - ST_UB2(q1_out, q2_out, filter48, 16); - filter48 += (2 * 16); - ST_UB(flat, filter48); - - return 0; - } -} - -int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { - v16u8 flat, flat2, filter8; - v16i8 zero = { 0 }; - v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; - v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; - v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; - v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; - v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; - v8i16 l_out, r_out; - - flat = LD_UB(filter48 + 6 * 16); - - LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); - LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); - - AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); - - if (__msa_test_bz_v(flat2)) { - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - - LD_UB4(filter48, 16, p2, p1, p0, q0); - LD_UB2(filter48 + 4 * 16, 16, q1, q2); - - ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec3, vec4); - ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec6, 
vec7); - ILVRL_B2_SH(q2, q1, vec2, vec5); - - src_org -= 3; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec2, 0, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec2, 4, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec5, 0, (src_org + 4), pitch); - src_org += (4 * pitch); - ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); - ST2x4_UB(vec5, 4, (src_org + 4), pitch); - - return 1; - } else { - src -= 7 * 16; - - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, - p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, - p2_r_in, p1_r_in, p0_r_in); - q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); - - tmp0_r = p7_r_in << 3; - tmp0_r -= p7_r_in; - tmp0_r += p6_r_in; - tmp0_r += q0_r_in; - tmp1_r = p6_r_in + p5_r_in; - tmp1_r += p4_r_in; - tmp1_r += p3_r_in; - tmp1_r += p2_r_in; - tmp1_r += p1_r_in; - tmp1_r += p0_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - - ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, - p5_l_in, p4_l_in); - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, - p1_l_in, p0_l_in); - q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); - - tmp0_l = p7_l_in << 3; - tmp0_l -= p7_l_in; - tmp0_l += p6_l_in; - tmp0_l += q0_l_in; - tmp1_l = p6_l_in + p5_l_in; - tmp1_l += p4_l_in; - tmp1_l += p3_l_in; - tmp1_l += p2_l_in; - tmp1_l += p1_l_in; - tmp1_l += p0_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); - ST_UB(p6, src); - src += 16; - - /* p5 */ - q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); - tmp0_r = p5_r_in - p6_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); - tmp0_l = p5_l_in 
- p6_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); - ST_UB(p5, src); - src += 16; - - /* p4 */ - q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); - tmp0_r = p4_r_in - p5_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); - tmp0_l = p4_l_in - p5_l_in; - tmp0_l += q2_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); - ST_UB(p4, src); - src += 16; - - /* p3 */ - q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); - tmp0_r = p3_r_in - p4_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); - tmp0_l = p3_l_in - p4_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); - ST_UB(p3, src); - src += 16; - - /* p2 */ - q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); - filter8 = LD_UB(filter48); - tmp0_r = p2_r_in - p3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); - tmp0_l = p2_l_in - p3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* p1 */ - q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); - filter8 = LD_UB(filter48 + 16); - tmp0_r = p1_r_in - p2_r_in; - tmp0_r += 
q5_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); - tmp0_l = p1_l_in - p2_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)(tmp1_l), 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* p0 */ - q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); - filter8 = LD_UB(filter48 + 32); - tmp0_r = p0_r_in - p1_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); - tmp0_l = p0_l_in - p1_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* q0 */ - q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); - filter8 = LD_UB(filter48 + 48); - tmp0_r = q7_r_in - p0_r_in; - tmp0_r += q0_r_in; - tmp0_r -= p7_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); - tmp0_l = q7_l_in - p0_l_in; - tmp0_l += q0_l_in; - tmp0_l -= p7_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* q1 */ - filter8 = LD_UB(filter48 + 64); - tmp0_r = q7_r_in - q0_r_in; - tmp0_r += q1_r_in; - tmp0_r -= p6_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q0_l_in; - tmp0_l += q1_l_in; - tmp0_l -= p6_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, 
flat2); - ST_UB(filter8, src); - src += 16; - - /* q2 */ - filter8 = LD_UB(filter48 + 80); - tmp0_r = q7_r_in - q1_r_in; - tmp0_r += q2_r_in; - tmp0_r -= p5_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q1_l_in; - tmp0_l += q2_l_in; - tmp0_l -= p5_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); - ST_UB(filter8, src); - src += 16; - - /* q3 */ - tmp0_r = q7_r_in - q2_r_in; - tmp0_r += q3_r_in; - tmp0_r -= p4_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q2_l_in; - tmp0_l += q3_l_in; - tmp0_l -= p4_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); - ST_UB(q3, src); - src += 16; - - /* q4 */ - tmp0_r = q7_r_in - q3_r_in; - tmp0_r += q4_r_in; - tmp0_r -= p3_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q3_l_in; - tmp0_l += q4_l_in; - tmp0_l -= p3_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); - ST_UB(q4, src); - src += 16; - - /* q5 */ - tmp0_r = q7_r_in - q4_r_in; - tmp0_r += q5_r_in; - tmp0_r -= p2_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q4_l_in; - tmp0_l += q5_l_in; - tmp0_l -= p2_l_in; - tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); - ST_UB(q5, src); - src += 16; - - /* q6 */ - tmp0_r = q7_r_in - q5_r_in; - tmp0_r += q6_r_in; - tmp0_r -= p1_r_in; - tmp1_r += tmp0_r; - r_out = __msa_srari_h((v8i16)tmp1_r, 4); - tmp0_l = q7_l_in - q5_l_in; - tmp0_l += q6_l_in; - tmp0_l -= p1_l_in; - 
tmp1_l += tmp0_l; - l_out = __msa_srari_h((v8i16)tmp1_l, 4); - r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); - q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); - ST_UB(q6, src); - - return 0; - } -} - -void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint8_t early_exit = 0; - DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); - uint8_t *filter48 = &transposed_input[16 * 16]; - - transpose_16x16((src - 8), pitch, &transposed_input[0], 16); - - early_exit = - aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); - - if (0 == early_exit) { - early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); - - if (0 == early_exit) { - transpose_16x16(transposed_input, 16, (src - 8), pitch); - } - } -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c deleted file mode 100644 index dc0a97764..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/loopfilter_msa.h" - -void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint64_t p1_d, p0_d, q0_d, q1_d; - v16u8 mask, hev, flat, thresh, b_limit, limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); -} - -void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0_ptr, - const uint8_t *limit0_ptr, - const uint8_t *thresh0_ptr, - const uint8_t *b_limit1_ptr, - const uint8_t *limit1_ptr, - const uint8_t *thresh1_ptr) { - v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); - thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); - thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); - - b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); - b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); - b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); - - limit0 = (v16u8)__msa_fill_b(*limit0_ptr); - limit1 = (v16u8)__msa_fill_b(*limit1_ptr); - limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, 
limit0, b_limit0, thresh0, hev, - mask, flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); - - ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); -} - -void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 mask, hev, flat, limit, thresh, b_limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v8i16 vec0, vec1, vec2, vec3; - - LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, - q3); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); - ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - - src -= 2; - ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); - src += 4 * pitch; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); -} - -void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0_ptr, - const uint8_t *limit0_ptr, - const uint8_t *thresh0_ptr, - const uint8_t *b_limit1_ptr, - const uint8_t *limit1_ptr, - const uint8_t *thresh1_ptr) { - v16u8 mask, hev, flat; - v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 row0, row1, row2, row3, row4, row5, row6, row7; - v16u8 row8, row9, row10, row11, row12, row13, row14, row15; - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; - - LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); - LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, - row14, row15); - - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, - row9, row10, row11, row12, row13, row14, row15, p3, p2, - p1, p0, q0, q1, q2, q3); - - thresh0 = 
(v16u8)__msa_fill_b(*thresh0_ptr); - thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); - thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); - - b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); - b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); - b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); - - limit0 = (v16u8)__msa_fill_b(*limit0_ptr); - limit1 = (v16u8)__msa_fill_b(*limit1_ptr); - limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, - mask, flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); - ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); - ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); - ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); - ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); - - src -= 2; - - ST4x8_UB(tmp2, tmp3, src, pitch); - src += (8 * pitch); - ST4x8_UB(tmp4, tmp5, src, pitch); -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c deleted file mode 100644 index dc203e79c..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "aom_dsp/mips/loopfilter_msa.h" - -void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; - v16u8 mask, hev, flat, thresh, b_limit, limit; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; - v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; - v16i8 zero = { 0 }; - - /* load vector elements */ - LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, - p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, - q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); - PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); - p1_out = __msa_bmnz_v(p1_out, 
(v16u8)p1_filter8, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); - - p2_d = __msa_copy_u_d((v2i64)p2_out, 0); - p1_d = __msa_copy_u_d((v2i64)p1_out, 0); - p0_d = __msa_copy_u_d((v2i64)p0_out, 0); - q0_d = __msa_copy_u_d((v2i64)q0_out, 0); - q1_d = __msa_copy_u_d((v2i64)q1_out, 0); - q2_d = __msa_copy_u_d((v2i64)q2_out, 0); - - src -= 3 * pitch; - - SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); - src += (4 * pitch); - SD(q1_d, src); - src += pitch; - SD(q2_d, src); - } -} - -void aom_lpf_horizontal_8_dual_msa( - uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, - const uint8_t *thresh1) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; - v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16u8 zero = { 0 }; - - /* load vector elements */ - LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - thresh = (v16u8)__msa_fill_b(*thresh0); - tmp = (v16u8)__msa_fill_b(*thresh1); - thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); - - b_limit = (v16u8)__msa_fill_b(*b_limit0); - tmp = (v16u8)__msa_fill_b(*b_limit1); - b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit); - - limit = (v16u8)__msa_fill_b(*limit0); - tmp = (v16u8)__msa_fill_b(*limit1); - limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - AOM_FLAT4(p3, p2, p0, q0, q2, q3, 
flat); - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - src -= 3 * pitch; - - ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); - src += (4 * pitch); - ST_UB2(q1_out, q2_out, src, pitch); - src += (2 * pitch); - } -} - -void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p1_out, p0_out, q0_out, q1_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, 
q0_filt8_r, q1_filt8_r, q2_filt8_r; - v16u8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3, vec4; - - /* load vector elements */ - LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); - - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, - q3); - - thresh = (v16u8)__msa_fill_b(*thresh_ptr); - b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); - limit = (v16u8)__msa_fill_b(*limit_ptr); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); - - if (__msa_test_bz_v(flat)) { - /* Store 4 pixels p1-_q1 */ - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - - src -= 2; - ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); - src += 4 * pitch; - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, - p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - /* Store 6 pixels p2-_q2 */ - 
ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); - - src -= 3; - ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec4, 0, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec4, 4, src + 4, pitch); - } -} - -void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0, const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *b_limit1, const uint8_t *limit1, - const uint8_t *thresh1) { - uint8_t *temp_src; - v16u8 p3, p2, p1, p0, q3, q2, q1, q0; - v16u8 p1_out, p0_out, q0_out, q1_out; - v16u8 flat, mask, hev, thresh, b_limit, limit; - v16u8 row4, row5, row6, row7, row12, row13, row14, row15; - v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; - v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; - v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; - v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; - v16u8 zero = { 0 }; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - - temp_src = src - 4; - - LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); - temp_src += (8 * pitch); - LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); - - /* transpose 16x8 matrix into 8x16 */ - TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, - row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, - q3); - - thresh = (v16u8)__msa_fill_b(*thresh0); - vec0 = (v8i16)__msa_fill_b(*thresh1); - thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh); - - b_limit = (v16u8)__msa_fill_b(*b_limit0); - vec0 = (v8i16)__msa_fill_b(*b_limit1); - b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit); - - limit = (v16u8)__msa_fill_b(*limit0); - vec0 = (v8i16)__msa_fill_b(*limit1); - limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); - - /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, 
q3, limit, b_limit, thresh, hev, - mask, flat); - /* flat4 */ - AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); - /* filter4 */ - AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); - - if (__msa_test_bz_v(flat)) { - ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec2, vec3); - ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec4, vec5); - - src -= 2; - ST4x8_UB(vec2, vec3, src, pitch); - src += 8 * pitch; - ST4x8_UB(vec4, vec5, src, pitch); - } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, - q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); - AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, - p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); - - ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); - ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); - - /* filter8 */ - AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, - p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); - - /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, - p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, - p0_filt8_r, q0_filt8_r); - PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, - q2_filt8_r); - - /* store pixel values */ - p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); - p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); - p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); - q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); - q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); - q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); - - ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec3, vec4); - ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); - ILVRL_H2_SH(vec1, vec0, vec6, vec7); - ILVRL_B2_SH(q2, q1, vec2, vec5); - - src -= 3; - 
ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec2, 0, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec2, 4, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec5, 0, src + 4, pitch); - src += (4 * pitch); - ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); - ST2x4_UB(vec5, 4, src + 4, pitch); - } -} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c deleted file mode 100644 index 8c41278be..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask; - uint32_t hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s); - - /* loop filter designed to work using chars so that we can make maximum use - of 8 bit simd instructions. 
*/ - for (i = 0; i < 2; i++) { - sm1 = s - (pitch << 2); - s0 = sm1 + pitch; - s1 = s0 + pitch; - s2 = s - pitch; - s3 = s; - s4 = s + pitch; - s5 = s4 + pitch; - s6 = s5 + pitch; - - __asm__ __volatile__( - "lw %[p1], (%[s1]) \n\t" - "lw %[p2], (%[s2]) \n\t" - "lw %[p3], (%[s3]) \n\t" - "lw %[p4], (%[s4]) \n\t" - - : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - /* if (p1 - p4 == 0) and (p2 - p3 == 0) - mask will be zero and filtering is not needed */ - if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - __asm__ __volatile__( - "lw %[pm1], (%[sm1]) \n\t" - "lw %[p0], (%[s0]) \n\t" - "lw %[p5], (%[s5]) \n\t" - "lw %[p6], (%[s6]) \n\t" - - : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) - : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); - - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, - p6, thresh_vec, &hev, &mask); - - /* if mask == 0 do filtering is not needed */ - if (mask) { - /* filtering */ - filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); - - __asm__ __volatile__( - "sw %[p1], (%[s1]) \n\t" - "sw %[p2], (%[s2]) \n\t" - "sw %[p3], (%[s3]) \n\t" - "sw %[p4], (%[s4]) \n\t" - - : - : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), - [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - } - } - - s = s + 4; - } -} - -void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], 
%[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s + pitch); - - for (i = 0; i < 2; i++) { - s1 = s; - s2 = s + pitch; - s3 = s2 + pitch; - s4 = s3 + pitch; - s = s4 + pitch; - - /* load quad-byte vectors - * memory is 4 byte aligned - */ - p2 = *((uint32_t *)(s1 - 4)); - p6 = *((uint32_t *)(s1)); - p1 = *((uint32_t *)(s2 - 4)); - p5 = *((uint32_t *)(s2)); - p0 = *((uint32_t *)(s3 - 4)); - p4 = *((uint32_t *)(s3)); - pm1 = *((uint32_t *)(s4 - 4)); - p3 = *((uint32_t *)(s4)); - - /* transpose pm1, p0, p1, p2 */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" - "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" - "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" - "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" - - "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" - "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" - "append %[p1], %[sec3], 16 \n\t" - "append %[pm1], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), - [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose p3, p4, p5, p6 */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" - "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" - "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" - "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" - - "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" - "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" - 
"append %[p5], %[sec3], 16 \n\t" - "append %[p3], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), - [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* if (p1 - p4 == 0) and (p2 - p3 == 0) - * mask will be zero and filtering is not needed - */ - if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, - p6, thresh_vec, &hev, &mask); - - /* if mask == 0 do filtering is not needed */ - if (mask) { - /* filtering */ - filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); - - /* unpack processed 4x4 neighborhood - * don't use transpose on output data - * because memory isn't aligned - */ - __asm__ __volatile__( - "sb %[p4], 1(%[s4]) \n\t" - "sb %[p3], 0(%[s4]) \n\t" - "sb %[p2], -1(%[s4]) \n\t" - "sb %[p1], -2(%[s4]) \n\t" - - : - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [s4] "r"(s4)); - - __asm__ __volatile__( - "srl %[p4], %[p4], 8 \n\t" - "srl %[p3], %[p3], 8 \n\t" - "srl %[p2], %[p2], 8 \n\t" - "srl %[p1], %[p1], 8 \n\t" - - : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) - :); - - __asm__ __volatile__( - "sb %[p4], 1(%[s3]) \n\t" - "sb %[p3], 0(%[s3]) \n\t" - "sb %[p2], -1(%[s3]) \n\t" - "sb %[p1], -2(%[s3]) \n\t" - - : [p1] "+r"(p1) - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); - - __asm__ __volatile__( - "srl %[p4], %[p4], 8 \n\t" - "srl %[p3], %[p3], 8 \n\t" - "srl %[p2], %[p2], 8 \n\t" - "srl %[p1], %[p1], 8 \n\t" - - : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) - :); - - __asm__ __volatile__( - "sb %[p4], 1(%[s2]) \n\t" - "sb %[p3], 0(%[s2]) \n\t" - "sb %[p2], -1(%[s2]) \n\t" - "sb %[p1], -2(%[s2]) \n\t" - - : - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [s2] "r"(s2)); - - __asm__ __volatile__( - "srl %[p4], %[p4], 8 \n\t" - "srl %[p3], %[p3], 8 \n\t" - "srl %[p2], %[p2], 8 \n\t" - "srl %[p1], %[p1], 
8 \n\t" - - : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) - :); - - __asm__ __volatile__( - "sb %[p4], 1(%[s1]) \n\t" - "sb %[p3], 0(%[s1]) \n\t" - "sb %[p2], -1(%[s1]) \n\t" - "sb %[p1], -2(%[s1]) \n\t" - - : - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [s1] "r"(s1)); - } - } - } -} - -void aom_lpf_horizontal_4_dual_dspr2( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); -} - -void aom_lpf_horizontal_8_dual_dspr2( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); -} - -void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); - aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); -} -#endif // #if HAVE_DSPR2 diff 
--git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h deleted file mode 100644 index 28f0dc35a..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h +++ /dev/null @@ -1,736 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_mem/aom_mem.h" -#include "aom_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -/* inputs & outputs are quad-byte vectors */ -static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, - uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { - int32_t aom_filter_l, aom_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; - - N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; - HWM = 0xFF00FF00; - - vps0 = (*ps0) ^ N128; - vps1 = (*ps1) ^ N128; - vqs0 = (*qs0) ^ N128; - vqs1 = (*qs1) ^ N128; - - /* use halfword pairs instead quad-bytes because of accuracy */ - vps0_l = vps0 & HWM; - vps0_r = vps0 << 8; - vps0_r = vps0_r & HWM; - - vps1_l = vps1 & HWM; - 
vps1_r = vps1 << 8; - vps1_r = vps1_r & HWM; - - vqs0_l = vqs0 & HWM; - vqs0_r = vqs0 << 8; - vqs0_r = vqs0_r & HWM; - - vqs1_l = vqs1 & HWM; - vqs1_r = vqs1 << 8; - vqs1_r = vqs1_r & HWM; - - mask_l = mask & HWM; - mask_r = mask << 8; - mask_r = mask_r & HWM; - - hev_l = hev & HWM; - hev_r = hev << 8; - hev_r = hev_r & HWM; - - __asm__ __volatile__( - /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ - "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" - "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" - - /* qs0 - ps0 */ - "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" - "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" - - /* aom_filter &= hev; */ - "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" - - /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_l], %[hev_l], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_r], %[hev_r], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - - /* aom_filter &= mask; */ - "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" - - : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), - [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), - [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) - : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), - [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), - [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), - [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), - [HWM] "r"(HWM)); - - /* save bottom 3 bits so that we round one side +4 
and the other +3 */ - __asm__ __volatile__( - /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ - "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" - "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" - - /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ - "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" - "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" - "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" - "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" - - "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" - "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" - - "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" - "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" - - /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ - "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" - "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" - - /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ - "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - - : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), - [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), - [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), - [vqs0_r] "+r"(vqs0_r) - : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), - [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); - - __asm__ __volatile__( - /* (aom_filter += 1) >>= 1 */ - "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" - "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" - - /* aom_filter &= ~hev; */ - "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" - "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" - - /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ - "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" - "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" - - /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ - "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - - : [Filter1_l] 
"+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), - [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), - [vqs1_r] "+r"(vqs1_r) - : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); - - /* Create quad-bytes from halfword pairs */ - vqs0_l = vqs0_l & HWM; - vqs1_l = vqs1_l & HWM; - vps0_l = vps0_l & HWM; - vps1_l = vps1_l & HWM; - - __asm__ __volatile__( - "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" - "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" - "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" - "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - - : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), - [vqs0_r] "+r"(vqs0_r) - :); - - vqs0 = vqs0_l | vqs0_r; - vqs1 = vqs1_l | vqs1_r; - vps0 = vps0_l | vps0_r; - vps1 = vps1_l | vps1_r; - - *ps0 = vps0 ^ N128; - *ps1 = vps1 ^ N128; - *qs0 = vqs0 ^ N128; - *qs1 = vqs1 ^ N128; -} - -static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, - uint32_t ps0, uint32_t qs0, uint32_t qs1, - uint32_t *p1_f0, uint32_t *p0_f0, - uint32_t *q0_f0, uint32_t *q1_f0) { - int32_t aom_filter_l, aom_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; - - N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; - HWM = 0xFF00FF00; - - vps0 = (ps0) ^ N128; - vps1 = (ps1) ^ N128; - vqs0 = (qs0) ^ N128; - vqs1 = (qs1) ^ N128; - - /* use halfword pairs instead quad-bytes because of accuracy */ - vps0_l = vps0 & HWM; - vps0_r = vps0 << 8; - vps0_r = vps0_r & HWM; - - vps1_l = vps1 & HWM; - vps1_r = vps1 << 8; - vps1_r = vps1_r & HWM; - - vqs0_l = vqs0 & HWM; - vqs0_r = vqs0 << 8; - vqs0_r = vqs0_r & HWM; - - vqs1_l = vqs1 & HWM; - vqs1_r = vqs1 << 8; - vqs1_r = vqs1_r & HWM; - - mask_l = mask & HWM; - mask_r = mask << 8; - mask_r = mask_r & 
HWM; - - hev_l = hev & HWM; - hev_r = hev << 8; - hev_r = hev_r & HWM; - - __asm__ __volatile__( - /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ - "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" - "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" - - /* qs0 - ps0 */ - "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" - "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" - - /* aom_filter &= hev; */ - "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" - - /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_l], %[hev_l], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - "xor %[invhev_r], %[hev_r], %[HWM] \n\t" - "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" - "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" - - /* aom_filter &= mask; */ - "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" - "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" - - : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), - [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), - [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) - : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), - [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), - [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), - [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), - [HWM] "r"(HWM)); - - /* save bottom 3 bits so that we round one side +4 and the other +3 */ - __asm__ __volatile__( - /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ - "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" - "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" - - /* Filter1 = 
aom_signed_char_clamp(aom_filter + 4) >>= 3; */ - "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" - "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" - "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" - "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" - - "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" - "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" - - "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" - "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" - - /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ - "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" - "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" - - /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ - "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - - : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), - [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), - [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), - [vqs0_r] "+r"(vqs0_r) - : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), - [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); - - __asm__ __volatile__( - /* (aom_filter += 1) >>= 1 */ - "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" - "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" - - /* aom_filter &= ~hev; */ - "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" - "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" - - /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ - "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" - "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" - - /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ - "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" - "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - - : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), - [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), - [vqs1_r] "+r"(vqs1_r) - : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); - - /* Create 
quad-bytes from halfword pairs */ - vqs0_l = vqs0_l & HWM; - vqs1_l = vqs1_l & HWM; - vps0_l = vps0_l & HWM; - vps1_l = vps1_l & HWM; - - __asm__ __volatile__( - "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" - "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" - "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" - "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - - : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), - [vqs0_r] "+r"(vqs0_r) - :); - - vqs0 = vqs0_l | vqs0_r; - vqs1 = vqs1_l | vqs1_r; - vps0 = vps0_l | vps0_r; - vps1 = vps1_l | vps1_r; - - *p0_f0 = vps0 ^ N128; - *p1_f0 = vps1 ^ N128; - *q0_f0 = vqs0 ^ N128; - *q1_f0 = vqs1 ^ N128; -} - -static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, - uint32_t *op0, uint32_t *oq0, uint32_t *oq1, - uint32_t *oq2, uint32_t *oq3) { - /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ - const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; - - /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ - /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ - /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ - /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ - /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ - /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - - __asm__ __volatile__( - "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" - - "shll.ph %[tmp], 
%[p3], 1 \n\t" - "addu.ph %[res_op2], %[tmp], %[p3] \n\t" - "addu.ph %[res_op1], %[p3], %[p3] \n\t" - "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" - "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" - "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" - "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" - "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" - "shrl.ph %[res_op1], %[res_op1], 3 \n\t" - "shrl.ph %[res_op2], %[res_op2], 3 \n\t" - "addu.ph %[res_op0], %[p3], %[p0] \n\t" - "addu.ph %[res_oq0], %[q0], %[q3] \n\t" - "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq1], %[q3], %[q3] \n\t" - "shll.ph %[tmp], %[q3], 1 \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" - "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" - "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" - "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" - "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" - "shrl.ph %[res_op0], %[res_op0], 3 \n\t" - "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - - : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), - [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), - [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), - [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), - [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); - - *op2 = res_op2; - *op1 = res_op1; - *op0 = res_op0; - *oq0 = res_oq0; - *oq1 = res_oq1; - *oq2 = res_oq2; -} - -static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, - uint32_t p0, uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t 
q3, uint32_t *op2_f1, - uint32_t *op1_f1, uint32_t *op0_f1, - uint32_t *oq0_f1, uint32_t *oq1_f1, - uint32_t *oq2_f1) { - /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; - - /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ - /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ - /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ - /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ - /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ - /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - - __asm__ __volatile__( - "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" - "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" - - "shll.ph %[tmp], %[p3], 1 \n\t" - "addu.ph %[res_op2], %[tmp], %[p3] \n\t" - "addu.ph %[res_op1], %[p3], %[p3] \n\t" - "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" - "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" - "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" - "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" - "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" - "shrl.ph %[res_op1], %[res_op1], 3 \n\t" - "shrl.ph %[res_op2], %[res_op2], 3 \n\t" - "addu.ph %[res_op0], %[p3], %[p0] \n\t" - "addu.ph %[res_oq0], %[q0], %[q3] \n\t" - "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" - "addu.ph %[res_oq1], %[q3], %[q3] \n\t" - "shll.ph %[tmp], 
%[q3], 1 \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" - "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" - "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" - "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" - "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" - "shrl.ph %[res_op0], %[res_op0], 3 \n\t" - "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - - : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), - [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), - [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), - [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), - [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); - - *op2_f1 = res_op2; - *op1_f1 = res_op1; - *op0_f1 = res_op0; - *oq0_f1 = res_oq0; - *oq1_f1 = res_oq1; - *oq2_f1 = res_oq2; -} - -static INLINE void wide_mbfilter_dspr2( - uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, - uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, - uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, - uint32_t *oq7) { - const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; - const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; - uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; - uint32_t tmp; - uint32_t add_p6toq6; - uint32_t u32Eight = 0x00080008; - - __asm__ __volatile__( - /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 - which is used most of the time */ - "addu.ph %[add_p6toq6], %[p6], %[p5] 
\n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" - "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" - - : [add_p6toq6] "=&r"(add_p6toq6) - : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), - [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), - [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), - [u32Eight] "r"(u32Eight)); - - __asm__ __volatile__( - /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + - p3 + p2 + p1 + p0 + q0, 4) */ - "shll.ph %[tmp], %[p7], 3 \n\t" - "subu.ph %[res_op6], %[tmp], %[p7] \n\t" - "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" - "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" - "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" - "shrl.ph %[res_op6], %[res_op6], 4 \n\t" - - /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + - p2 + p1 + p0 + q0 + q1, 4) */ - "shll.ph %[tmp], %[p7], 2 \n\t" - "addu.ph %[res_op5], %[tmp], %[p7] \n\t" - "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" - "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" - "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" - "subu.ph %[res_op5], %[res_op5], 
%[q3] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" - "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" - "shrl.ph %[res_op5], %[res_op5], 4 \n\t" - - /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + - p1 + p0 + q0 + q1 + q2, 4) */ - "shll.ph %[tmp], %[p7], 2 \n\t" - "addu.ph %[res_op4], %[tmp], %[p7] \n\t" - "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" - "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" - "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" - "shrl.ph %[res_op4], %[res_op4], 4 \n\t" - - /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + - p1 + p0 + q0 + q1 + q2 + q3, 4) */ - "shll.ph %[tmp], %[p7], 2 \n\t" - "addu.ph %[res_op3], %[tmp], %[p3] \n\t" - "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" - "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" - "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" - "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" - "shrl.ph %[res_op3], %[res_op3], 4 \n\t" - - /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + - p0 + q0 + q1 + q2 + q3 + q4, 4) */ - "shll.ph %[tmp], %[p7], 1 \n\t" - "addu.ph %[res_op2], %[tmp], %[p7] \n\t" - "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" - "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" - "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" - "shrl.ph %[res_op2], %[res_op2], 4 \n\t" - - /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + - p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ - "shll.ph %[tmp], %[p7], 1 \n\t" - "addu.ph %[res_op1], %[tmp], %[p1] \n\t" - "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" - "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" - "shrl.ph %[res_op1], %[res_op1], 4 \n\t" - - /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 
+ q4 + q5 + q6, 4) */ - "addu.ph %[res_op0], %[p7], %[p0] \n\t" - "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" - "shrl.ph %[res_op0], %[res_op0], 4 \n\t" - - : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), - [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), - [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), - [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) - : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), - [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), - [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), - [add_p6toq6] "r"(add_p6toq6)); - - *op6 = res_op6; - *op5 = res_op5; - *op4 = res_op4; - *op3 = res_op3; - *op2 = res_op2; - *op1 = res_op1; - *op0 = res_op0; - - __asm__ __volatile__( - /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + - q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ - "addu.ph %[res_oq0], %[q7], %[q0] \n\t" - "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" - "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" - - /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + - q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ - "shll.ph %[tmp], %[q7], 1 \n\t" - "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" - "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" - "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" - "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" - - /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + - q3 + q4 + q5 + q6 + q7 * 3, 4) */ - "shll.ph %[tmp], %[q7], 1 \n\t" - "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" - "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" - "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t" - "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" - - /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + - q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ - "shll.ph %[tmp], %[q7], 2 \n\t" - "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" - "addu.ph %[res_oq3], 
%[res_oq3], %[add_p6toq6] \n\t" - "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" - "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" - "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" - "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" - - /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + - q4 * 2 + q5 + q6 + q7 * 5, 4) */ - "shll.ph %[tmp], %[q7], 2 \n\t" - "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" - "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" - "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" - "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" - - /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + - q5 * 2 + q6 + q7 * 6, 4) */ - "shll.ph %[tmp], %[q7], 2 \n\t" - "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" - "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" - "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" - "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" - "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" - - /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + - q4 + q5 + q6 * 2 + q7 * 7, 4) */ - "shll.ph %[tmp], %[q7], 3 \n\t" - "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" - "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" - "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" - "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" - "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" - - : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), - [res_oq4] 
"=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), - [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), - [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) - : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), - [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), - [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), - [add_p6toq6] "r"(add_p6toq6)); - - *oq0 = res_oq0; - *oq1 = res_oq1; - *oq2 = res_oq2; - *oq3 = res_oq3; - *oq4 = res_oq4; - *oq5 = res_oq5; - *oq6 = res_oq6; -} -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h deleted file mode 100644 index 62295d69d..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h +++ /dev/null @@ -1,437 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_mem/aom_mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -#define STORE_F0() \ - { \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s4]) \n\t" \ - "sb %[q0_f0], 0(%[s4]) \n\t" \ - "sb %[p0_f0], -1(%[s4]) \n\t" \ - "sb %[p1_f0], -2(%[s4]) \n\t" \ - \ - : \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ - [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ - \ - __asm__ __volatile__( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ - [p1_f0] "+r"(p1_f0) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s3]) \n\t" \ - "sb %[q0_f0], 0(%[s3]) \n\t" \ - "sb %[p0_f0], -1(%[s3]) \n\t" \ - "sb %[p1_f0], -2(%[s3]) \n\t" \ - \ - : [p1_f0] "+r"(p1_f0) \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ - [p0_f0] "r"(p0_f0)); \ - \ - __asm__ __volatile__( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ - [p1_f0] "+r"(p1_f0) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s2]) \n\t" \ - "sb %[q0_f0], 0(%[s2]) \n\t" \ - "sb %[p0_f0], -1(%[s2]) \n\t" \ - "sb %[p1_f0], -2(%[s2]) \n\t" \ - \ - : \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ - [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ - \ - __asm__ __volatile__( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ - [p1_f0] "+r"(p1_f0) \ - 
:); \ - \ - __asm__ __volatile__( \ - "sb %[q1_f0], 1(%[s1]) \n\t" \ - "sb %[q0_f0], 0(%[s1]) \n\t" \ - "sb %[p0_f0], -1(%[s1]) \n\t" \ - "sb %[p1_f0], -2(%[s1]) \n\t" \ - \ - : \ - : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ - [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ - } - -#define STORE_F1() \ - { \ - __asm__ __volatile__( \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - \ - : \ - : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ - [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ - \ - __asm__ __volatile__( \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], %[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - \ - : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ - [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - "sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - \ - : \ - : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ - [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ - \ - __asm__ __volatile__( \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - \ - : \ - : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ - [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ - \ - __asm__ __volatile__( \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], %[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 
16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - \ - : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ - [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - \ - : \ - : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ - [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ - } - -#define STORE_F2() \ - { \ - __asm__ __volatile__( \ - "sb %[q6_r], 6(%[s4]) \n\t" \ - "sb %[q5_r], 5(%[s4]) \n\t" \ - "sb %[q4_r], 4(%[s4]) \n\t" \ - "sb %[q3_r], 3(%[s4]) \n\t" \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - "sb %[p3_r], -4(%[s4]) \n\t" \ - "sb %[p4_r], -5(%[s4]) \n\t" \ - "sb %[p5_r], -6(%[s4]) \n\t" \ - "sb %[p6_r], -7(%[s4]) \n\t" \ - \ - : \ - : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ - [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ - [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ - [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ - [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ - \ - __asm__ __volatile__( \ - "srl %[q6_r], %[q6_r], 16 \n\t" \ - "srl %[q5_r], %[q5_r], 16 \n\t" \ - "srl %[q4_r], %[q4_r], 16 \n\t" \ - "srl %[q3_r], %[q3_r], 16 \n\t" \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], %[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - "srl %[p3_r], %[p3_r], 16 \n\t" \ - "srl %[p4_r], %[p4_r], 16 \n\t" \ - "srl %[p5_r], %[p5_r], 16 \n\t" \ - "srl %[p6_r], %[p6_r], 16 \n\t" \ - \ - : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ - [q3_r] "+r"(q3_r), [q2_r] 
"+r"(q2_r), [q1_r] "+r"(q1_r), \ - [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ - [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ - [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q6_r], 6(%[s3]) \n\t" \ - "sb %[q5_r], 5(%[s3]) \n\t" \ - "sb %[q4_r], 4(%[s3]) \n\t" \ - "sb %[q3_r], 3(%[s3]) \n\t" \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - "sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - "sb %[p3_r], -4(%[s3]) \n\t" \ - "sb %[p4_r], -5(%[s3]) \n\t" \ - "sb %[p5_r], -6(%[s3]) \n\t" \ - "sb %[p6_r], -7(%[s3]) \n\t" \ - \ - : \ - : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ - [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ - [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ - [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ - [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ - \ - __asm__ __volatile__( \ - "sb %[q6_l], 6(%[s2]) \n\t" \ - "sb %[q5_l], 5(%[s2]) \n\t" \ - "sb %[q4_l], 4(%[s2]) \n\t" \ - "sb %[q3_l], 3(%[s2]) \n\t" \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - "sb %[p3_l], -4(%[s2]) \n\t" \ - "sb %[p4_l], -5(%[s2]) \n\t" \ - "sb %[p5_l], -6(%[s2]) \n\t" \ - "sb %[p6_l], -7(%[s2]) \n\t" \ - \ - : \ - : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ - [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ - [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ - [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ - [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ - \ - __asm__ __volatile__( \ - "srl %[q6_l], %[q6_l], 16 \n\t" \ - "srl %[q5_l], %[q5_l], 16 \n\t" \ - "srl %[q4_l], %[q4_l], 16 \n\t" \ - "srl %[q3_l], %[q3_l], 16 \n\t" \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], 
%[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - "srl %[p3_l], %[p3_l], 16 \n\t" \ - "srl %[p4_l], %[p4_l], 16 \n\t" \ - "srl %[p5_l], %[p5_l], 16 \n\t" \ - "srl %[p6_l], %[p6_l], 16 \n\t" \ - \ - : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ - [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ - [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ - [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ - [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ - :); \ - \ - __asm__ __volatile__( \ - "sb %[q6_l], 6(%[s1]) \n\t" \ - "sb %[q5_l], 5(%[s1]) \n\t" \ - "sb %[q4_l], 4(%[s1]) \n\t" \ - "sb %[q3_l], 3(%[s1]) \n\t" \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - "sb %[p3_l], -4(%[s1]) \n\t" \ - "sb %[p4_l], -5(%[s1]) \n\t" \ - "sb %[p5_l], -6(%[s1]) \n\t" \ - "sb %[p6_l], -7(%[s1]) \n\t" \ - \ - : \ - : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ - [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ - [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ - [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ - [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ - } - -#define PACK_LEFT_0TO3() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ - "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ - "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ - "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ - "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ - "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ - "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ - "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ - \ - : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ - [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ - [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ - : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), 
[p0] "r"(p0), \ - [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ - } - -#define PACK_LEFT_4TO7() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ - "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ - "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ - "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ - "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ - "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ - "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ - "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ - \ - : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \ - [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ - [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ - : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ - [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ - } - -#define PACK_RIGHT_0TO3() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ - "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ - "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ - "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ - "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ - "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ - "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ - "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ - \ - : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ - [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ - [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ - : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ - [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ - } - -#define PACK_RIGHT_4TO7() \ - { \ - __asm__ __volatile__( \ - "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ - "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ - "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ - "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ - "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ - "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ - "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ - "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ - \ - : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ - [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ - [q6_r] "=&r"(q6_r), [q7_r] 
"=&r"(q7_r) \ - : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ - [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ - } - -#define COMBINE_LEFT_RIGHT_0TO2() \ - { \ - __asm__ __volatile__( \ - "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ - "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ - "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ - "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ - "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ - "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ - \ - : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ - [q1] "=&r"(q1), [q2] "=&r"(q2) \ - : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ - [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ - [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ - [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ - } - -#define COMBINE_LEFT_RIGHT_3TO6() \ - { \ - __asm__ __volatile__( \ - "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ - "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ - "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ - "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ - "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ - "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ - "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ - "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ - \ - : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ - [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ - [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ - [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ - [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ - [q6_r] "r"(q6_r)); \ - } - -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h 
b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h deleted file mode 100644 index a0f57f386..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_mem/aom_mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -/* processing 4 pixels at the same time - * compute hev and mask in the same function */ -static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, - uint32_t p1, uint32_t p0, uint32_t p3, - uint32_t p2, uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t q3, - uint32_t thresh, uint32_t *hev, - uint32_t *mask) { - uint32_t c, r, r3, r_k; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t hev1; - - __asm__ __volatile__( - /* mask |= (abs(p3 - p2) > limit) */ - "subu_s.qb %[c], %[p3], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], $0, %[c] \n\t" - - /* mask |= (abs(p2 - p1) > limit) */ - "subu_s.qb %[c], %[p2], %[p1] \n\t" - "subu_s.qb %[r_k], %[p1], %[p2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(p1 - p0) > limit) - * hev |= (abs(p1 - p0) > thresh) - 
*/ - "subu_s.qb %[c], %[p1], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], $0, %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(q1 - q0) > limit) - * hev |= (abs(q1 - q0) > thresh) - */ - "subu_s.qb %[c], %[q1], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], %[r3], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(q2 - q1) > limit) */ - "subu_s.qb %[c], %[q2], %[q1] \n\t" - "subu_s.qb %[r_k], %[q1], %[q2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r3], %[r3], 24 \n\t" - - /* mask |= (abs(q3 - q2) > limit) */ - "subu_s.qb %[c], %[q3], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) - : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), - [thresh] "r"(thresh)); - - __asm__ __volatile__( - /* abs(p0 - q0) */ - "subu_s.qb %[c], %[p0], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[p0] \n\t" - "wrdsp %[r3] \n\t" - "or %[s1], %[r_k], %[c] \n\t" - - /* abs(p1 - q1) */ - "subu_s.qb %[c], %[p1], %[q1] \n\t" - "addu_s.qb %[s3], %[s1], %[s1] \n\t" - "pick.qb %[hev1], %[ones], $0 \n\t" - "subu_s.qb %[r_k], %[q1], %[p1] \n\t" - "or %[s2], %[r_k], %[c] \n\t" - - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ - "shrl.qb %[s2], %[s2], 1 \n\t" - "addu_s.qb %[s1], %[s2], %[s3] \n\t" - "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r], %[r], 24 \n\t" - - "wrdsp %[r] \n\t" - "pick.qb %[s2], $0, %[ones] \n\t" - - : 
[c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), - [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), - [ones] "r"(ones), [flimit] "r"(flimit)); - - *hev = hev1; - *mask = s2; -} - -static INLINE void filter_hev_mask_flatmask4_dspr2( - uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, - uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, - uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { - uint32_t c, r, r3, r_k, r_flat; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t hev1; - uint32_t flat1; - - __asm__ __volatile__( - /* mask |= (abs(p3 - p2) > limit) */ - "subu_s.qb %[c], %[p3], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], $0, %[c] \n\t" - - /* mask |= (abs(p2 - p1) > limit) */ - "subu_s.qb %[c], %[p2], %[p1] \n\t" - "subu_s.qb %[r_k], %[p1], %[p2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - /* mask |= (abs(p1 - p0) > limit) - * hev |= (abs(p1 - p0) > thresh) - * flat |= (abs(p1 - p0) > thresh) - */ - "subu_s.qb %[c], %[p1], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], $0, %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], $0, %[c] \n\t" - - /* mask |= (abs(q1 - q0) > limit) - * hev |= (abs(q1 - q0) > thresh) - * flat |= (abs(q1 - q0) > thresh) - */ - "subu_s.qb %[c], %[q1], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" - "or %[r3], %[r3], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - 
"cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p0 - p2) > thresh) */ - "subu_s.qb %[c], %[p0], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q0 - q2) > thresh) */ - "subu_s.qb %[c], %[q0], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p3 - p0) > thresh) */ - "subu_s.qb %[c], %[p3], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q3 - q0) > thresh) */ - "subu_s.qb %[c], %[q3], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - "sll %[r_flat], %[r_flat], 24 \n\t" - /* look at stall here */ - "wrdsp %[r_flat] \n\t" - "pick.qb %[flat1], $0, %[ones] \n\t" - - /* mask |= (abs(q2 - q1) > limit) */ - "subu_s.qb %[c], %[q2], %[q1] \n\t" - "subu_s.qb %[r_k], %[q1], %[q2] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r3], %[r3], 24 \n\t" - - /* mask |= (abs(q3 - q2) > limit) */ - "subu_s.qb %[c], %[q3], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), - [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) - : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), - [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), - [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); - - 
__asm__ __volatile__( - /* abs(p0 - q0) */ - "subu_s.qb %[c], %[p0], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[p0] \n\t" - "wrdsp %[r3] \n\t" - "or %[s1], %[r_k], %[c] \n\t" - - /* abs(p1 - q1) */ - "subu_s.qb %[c], %[p1], %[q1] \n\t" - "addu_s.qb %[s3], %[s1], %[s1] \n\t" - "pick.qb %[hev1], %[ones], $0 \n\t" - "subu_s.qb %[r_k], %[q1], %[p1] \n\t" - "or %[s2], %[r_k], %[c] \n\t" - - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ - "shrl.qb %[s2], %[s2], 1 \n\t" - "addu_s.qb %[s1], %[s2], %[s3] \n\t" - "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r], %[r], 24 \n\t" - - "wrdsp %[r] \n\t" - "pick.qb %[s2], $0, %[ones] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), - [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) - : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), - [ones] "r"(ones), [flimit] "r"(flimit)); - - *hev = hev1; - *mask = s2; - *flat = flat1; -} - -static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, - uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, - uint32_t q3, uint32_t q4, uint32_t *flat2) { - uint32_t c, r, r_k, r_flat; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t flat1, flat3; - - __asm__ __volatile__( - /* flat |= (abs(p4 - p0) > thresh) */ - "subu_s.qb %[c], %[p4], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p4] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r], $0, %[c] \n\t" - - /* flat |= (abs(q4 - q0) > thresh) */ - "subu_s.qb %[c], %[q4], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q4] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r], %[r], %[c] \n\t" - "sll %[r], %[r], 24 \n\t" - "wrdsp %[r] \n\t" - "pick.qb %[flat3], $0, %[ones] \n\t" - - /* flat |= (abs(p1 - p0) > thresh) */ - "subu_s.qb %[c], %[p1], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p1] \n\t" - "or %[r_k], %[r_k], %[c] 
\n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], $0, %[c] \n\t" - - /* flat |= (abs(q1 - q0) > thresh) */ - "subu_s.qb %[c], %[q1], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q1] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p0 - p2) > thresh) */ - "subu_s.qb %[c], %[p0], %[p2] \n\t" - "subu_s.qb %[r_k], %[p2], %[p0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q0 - q2) > thresh) */ - "subu_s.qb %[c], %[q0], %[q2] \n\t" - "subu_s.qb %[r_k], %[q2], %[q0] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(p3 - p0) > thresh) */ - "subu_s.qb %[c], %[p3], %[p0] \n\t" - "subu_s.qb %[r_k], %[p0], %[p3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - - /* flat |= (abs(q3 - q0) > thresh) */ - "subu_s.qb %[c], %[q3], %[q0] \n\t" - "subu_s.qb %[r_k], %[q0], %[q3] \n\t" - "or %[r_k], %[r_k], %[c] \n\t" - "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" - "or %[r_flat], %[r_flat], %[c] \n\t" - "sll %[r_flat], %[r_flat], 24 \n\t" - "wrdsp %[r_flat] \n\t" - "pick.qb %[flat1], $0, %[ones] \n\t" - /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ - "and %[flat1], %[flat3], %[flat1] \n\t" - - : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), - [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) - : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), - [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), - [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); - - *flat2 = flat1; -} -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git 
a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c deleted file mode 100644 index b67ccfe9d..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c +++ /dev/null @@ -1,590 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint32_t mask; - uint32_t hev, flat; - uint8_t i; - uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p3, p2, p1, p0, q0, q1, q2, q3; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : 
[uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s); - - for (i = 0; i < 2; i++) { - sp3 = s - (pitch << 2); - sp2 = sp3 + pitch; - sp1 = sp2 + pitch; - sp0 = sp1 + pitch; - sq0 = s; - sq1 = s + pitch; - sq2 = sq1 + pitch; - sq3 = sq2 + pitch; - - __asm__ __volatile__( - "lw %[p3], (%[sp3]) \n\t" - "lw %[p2], (%[sp2]) \n\t" - "lw %[p1], (%[sp1]) \n\t" - "lw %[p0], (%[sp0]) \n\t" - "lw %[q0], (%[sq0]) \n\t" - "lw %[q1], (%[sq1]) \n\t" - "lw %[q2], (%[sq2]) \n\t" - "lw %[q3], (%[sq3]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) - : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - __asm__ __volatile__( - "sw %[p1_f0], (%[sp1]) \n\t" - "sw %[p0_f0], (%[sp0]) \n\t" - "sw %[q0_f0], (%[sq0]) \n\t" - "sw %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1)); - } else if ((mask & flat) == 0xFFFFFFFF) { - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - COMBINE_LEFT_RIGHT_0TO2() - - __asm__ __volatile__( - "sw %[p2], (%[sp2]) \n\t" - "sw %[p1], (%[sp1]) \n\t" - "sw %[p0], (%[sp0]) \n\t" - "sw %[q0], (%[sq0]) \n\t" - "sw %[q1], (%[sq1]) \n\t" - "sw %[q2], (%[sq2]) \n\t" - - : - : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), - [q1] "r"(q1), [q2] 
"r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), - [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if ((flat != 0) && (mask != 0)) { - /* filtering */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], (%[sp2]) \n\t" - "sb %[p1_r], (%[sp1]) \n\t" - "sb %[p0_r], (%[sp0]) \n\t" - "sb %[q0_r], (%[sq0]) \n\t" - "sb %[q1_r], (%[sq1]) \n\t" - "sb %[q2_r], (%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], (%[sp1]) \n\t" - "sb %[p0_f0], (%[sp0]) \n\t" - "sb %[q0_f0], (%[sq0]) \n\t" - "sb %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], +1(%[sp2]) \n\t" - "sb 
%[p1_r], +1(%[sp1]) \n\t" - "sb %[p0_r], +1(%[sp0]) \n\t" - "sb %[q0_r], +1(%[sq0]) \n\t" - "sb %[q1_r], +1(%[sq1]) \n\t" - "sb %[q2_r], +1(%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], +1(%[sp1]) \n\t" - "sb %[p0_f0], +1(%[sp0]) \n\t" - "sb %[q0_f0], +1(%[sq0]) \n\t" - "sb %[q1_f0], +1(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), - [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), - [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], +2(%[sp2]) \n\t" - "sb %[p1_l], +2(%[sp1]) \n\t" - "sb %[p0_l], +2(%[sp0]) \n\t" - "sb %[q0_l], +2(%[sq0]) \n\t" - "sb %[q1_l], +2(%[sq1]) \n\t" - "sb %[q2_l], +2(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], +2(%[sp1]) \n\t" - "sb %[p0_f0], +2(%[sp0]) \n\t" - "sb %[q0_f0], +2(%[sq0]) \n\t" - "sb %[q1_f0], +2(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 
\n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], +3(%[sp2]) \n\t" - "sb %[p1_l], +3(%[sp1]) \n\t" - "sb %[p0_l], +3(%[sp0]) \n\t" - "sb %[q0_l], +3(%[sq0]) \n\t" - "sb %[q1_l], +3(%[sq1]) \n\t" - "sb %[q2_l], +3(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], +3(%[sp1]) \n\t" - "sb %[p0_f0], +3(%[sp0]) \n\t" - "sb %[q0_f0], +3(%[sq0]) \n\t" - "sb %[q1_f0], +3(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - } - - s = s + 4; - } -} - -void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p3, p2, p1, p0, q3, q2, q1, q0; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ 
__volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - prefetch_store(s + pitch); - - for (i = 0; i < 2; i++) { - s1 = s; - s2 = s + pitch; - s3 = s2 + pitch; - s4 = s3 + pitch; - s = s4 + pitch; - - __asm__ __volatile__( - "lw %[p0], -4(%[s1]) \n\t" - "lw %[p1], -4(%[s2]) \n\t" - "lw %[p2], -4(%[s3]) \n\t" - "lw %[p3], -4(%[s4]) \n\t" - "lw %[q3], (%[s1]) \n\t" - "lw %[q2], (%[s2]) \n\t" - "lw %[q1], (%[s3]) \n\t" - "lw %[q0], (%[s4]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - /* transpose p3, p2, p1, p0 - original (when loaded from memory) - register -4 -3 -2 -1 - p0 p0_0 p0_1 p0_2 p0_3 - p1 p1_0 p1_1 p1_2 p1_3 - p2 p2_0 p2_1 p2_2 p2_3 - p3 p3_0 p3_1 p3_2 p3_3 - - after transpose - register - p0 p3_3 p2_3 p1_3 p0_3 - p1 p3_2 p2_2 p1_2 p0_2 - p2 p3_1 p2_1 p1_1 p0_1 - p3 p3_0 p2_0 p1_0 p0_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" - "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" - "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" - "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" - - "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" - "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" - "append %[p1], %[sec3], 16 \n\t" - "append %[p3], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), - [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - 
- /* transpose q0, q1, q2, q3 - original (when loaded from memory) - register +1 +2 +3 +4 - q3 q3_0 q3_1 q3_2 q3_3 - q2 q2_0 q2_1 q2_2 q2_3 - q1 q1_0 q1_1 q1_2 q1_3 - q0 q0_0 q0_1 q0_2 q0_3 - - after transpose - register - q3 q0_3 q1_3 q2_3 q3_3 - q2 q0_2 q1_2 q2_2 q3_2 - q1 q0_1 q1_1 q2_1 q3_1 - q0 q0_0 q1_0 q2_0 q3_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" - "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" - "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" - "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" - - "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" - "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" - "append %[q2], %[sec3], 16 \n\t" - "append %[q0], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), - [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - STORE_F0() - } else if ((mask & flat) == 0xFFFFFFFF) { - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - STORE_F1() - } else if ((flat != 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, 
&p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s4]) \n\t" - "sb %[p1_r], -2(%[s4]) \n\t" - "sb %[p0_r], -1(%[s4]) \n\t" - "sb %[q0_r], (%[s4]) \n\t" - "sb %[q1_r], +1(%[s4]) \n\t" - "sb %[q2_r], +2(%[s4]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s4] "r"(s4)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s4]) \n\t" - "sb %[p0_f0], -1(%[s4]) \n\t" - "sb %[q0_f0], (%[s4]) \n\t" - "sb %[q1_f0], +1(%[s4]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s4] "r"(s4)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s3]) \n\t" - "sb %[p1_r], -2(%[s3]) \n\t" - "sb %[p0_r], -1(%[s3]) \n\t" - "sb %[q0_r], (%[s3]) \n\t" - "sb %[q1_r], +1(%[s3]) \n\t" - "sb %[q2_r], +2(%[s3]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s3] "r"(s3)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s3]) \n\t" - "sb %[p0_f0], -1(%[s3]) \n\t" - "sb %[q0_f0], (%[s3]) \n\t" - "sb %[q1_f0], +1(%[s3]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s3] "r"(s3)); - } - - __asm__ __volatile__( 
- "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), - [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), - [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s2] "r"(s2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s2]) \n\t" - "sb %[p0_f0], -1(%[s2]) \n\t" - "sb %[q0_f0], (%[s2]) \n\t" - "sb %[q1_f0], +1(%[s2]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s2] "r"(s2)); - } - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s1]) \n\t" - "sb %[p1_l], -2(%[s1]) \n\t" - "sb %[p0_l], -1(%[s1]) \n\t" - "sb %[q0_l], (%[s1]) \n\t" - "sb %[q1_l], +1(%[s1]) \n\t" - "sb %[q2_l], +2(%[s1]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s1] "r"(s1)); - } else if (mask & 
0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s1]) \n\t" - "sb %[p0_f0], -1(%[s1]) \n\t" - "sb %[q0_f0], (%[s1]) \n\t" - "sb %[q1_f0], +1(%[s1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s1] "r"(s1)); - } - } - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c deleted file mode 100644 index 34733e42e..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ /dev/null @@ -1,734 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, int count) { - uint32_t mask; - uint32_t hev, flat, flat2; - uint8_t i; - uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; - uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - /* prefetch data for store */ - prefetch_store(s); - - for (i = 0; i < (2 * count); i++) { - sp7 = s - (pitch << 3); - sp6 = sp7 + pitch; - sp5 = sp6 + pitch; - sp4 = sp5 + pitch; - sp3 = sp4 + pitch; - sp2 = sp3 + pitch; - sp1 = sp2 + pitch; - sp0 = sp1 + pitch; - sq0 = s; - sq1 = s + pitch; - sq2 = sq1 + pitch; - sq3 = sq2 + pitch; - sq4 = sq3 + 
pitch; - sq5 = sq4 + pitch; - sq6 = sq5 + pitch; - sq7 = sq6 + pitch; - - __asm__ __volatile__( - "lw %[p7], (%[sp7]) \n\t" - "lw %[p6], (%[sp6]) \n\t" - "lw %[p5], (%[sp5]) \n\t" - "lw %[p4], (%[sp4]) \n\t" - "lw %[p3], (%[sp3]) \n\t" - "lw %[p2], (%[sp2]) \n\t" - "lw %[p1], (%[sp1]) \n\t" - "lw %[p0], (%[sp0]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) - : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); - - __asm__ __volatile__( - "lw %[q0], (%[sq0]) \n\t" - "lw %[q1], (%[sq1]) \n\t" - "lw %[q2], (%[sq2]) \n\t" - "lw %[q3], (%[sq3]) \n\t" - "lw %[q4], (%[sq4]) \n\t" - "lw %[q5], (%[sq5]) \n\t" - "lw %[q6], (%[sq6]) \n\t" - "lw %[q7], (%[sq7]) \n\t" - - : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), - [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) - : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0), - [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); - - /* f0 */ - if (((flat2 == 0) && (flat == 0) && (mask != 0)) || - ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - __asm__ __volatile__( - "sw %[p1_f0], (%[sp1]) \n\t" - "sw %[p0_f0], (%[sp0]) \n\t" - "sw %[q0_f0], (%[sq0]) \n\t" - "sw %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1)); - } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && - (mask == 0xFFFFFFFF)) { - /* f2 */ - PACK_LEFT_0TO3() - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, 
&p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_0TO3() - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, - &q6_r, &q7_r); - - COMBINE_LEFT_RIGHT_0TO2() - COMBINE_LEFT_RIGHT_3TO6() - - __asm__ __volatile__( - "sw %[p6], (%[sp6]) \n\t" - "sw %[p5], (%[sp5]) \n\t" - "sw %[p4], (%[sp4]) \n\t" - "sw %[p3], (%[sp3]) \n\t" - "sw %[p2], (%[sp2]) \n\t" - "sw %[p1], (%[sp1]) \n\t" - "sw %[p0], (%[sp0]) \n\t" - - : - : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), - [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), - [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sw %[q6], (%[sq6]) \n\t" - "sw %[q5], (%[sq5]) \n\t" - "sw %[q4], (%[sq4]) \n\t" - "sw %[q3], (%[sq3]) \n\t" - "sw %[q2], (%[sq2]) \n\t" - "sw %[q1], (%[sq1]) \n\t" - "sw %[q0], (%[sq0]) \n\t" - - : - : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), - [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), - [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), - [sq1] "r"(sq1), [sq0] "r"(sq0)); - } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { - /* f1 */ - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - COMBINE_LEFT_RIGHT_0TO2() - - __asm__ __volatile__( - "sw %[p2], (%[sp2]) \n\t" - "sw %[p1], (%[sp1]) \n\t" - "sw %[p0], (%[sp0]) \n\t" - "sw %[q0], (%[sq0]) \n\t" - "sw %[q1], (%[sq1]) \n\t" - "sw %[q2], (%[sq2]) \n\t" - - : - : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), - [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), - [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if 
((flat2 == 0) && (flat != 0) && (mask != 0)) { - /* f0+f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], (%[sp2]) \n\t" - "sb %[p1_r], (%[sp1]) \n\t" - "sb %[p0_r], (%[sp0]) \n\t" - "sb %[q0_r], (%[sq0]) \n\t" - "sb %[q1_r], (%[sq1]) \n\t" - "sb %[q2_r], (%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], (%[sp1]) \n\t" - "sb %[p0_f0], (%[sp0]) \n\t" - "sb %[q0_f0], (%[sq0]) \n\t" - "sb %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], +1(%[sp2]) \n\t" - "sb %[p1_r], +1(%[sp1]) \n\t" - "sb %[p0_r], +1(%[sp0]) \n\t" - "sb %[q0_r], +1(%[sq0]) \n\t" - "sb %[q1_r], +1(%[sq1]) 
\n\t" - "sb %[q2_r], +1(%[sq2]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], +1(%[sp1]) \n\t" - "sb %[p0_f0], +1(%[sp0]) \n\t" - "sb %[q0_f0], +1(%[sq0]) \n\t" - "sb %[q1_f0], +1(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], +2(%[sp2]) \n\t" - "sb %[p1_l], +2(%[sp1]) \n\t" - "sb %[p0_l], +2(%[sp0]) \n\t" - "sb %[q0_l], +2(%[sq0]) \n\t" - "sb %[q1_l], +2(%[sq1]) \n\t" - "sb %[q2_l], +2(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], +2(%[sp1]) \n\t" - "sb %[p0_f0], +2(%[sp0]) \n\t" - "sb %[q0_f0], +2(%[sq0]) \n\t" - "sb %[q1_f0], +2(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 
\n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], +3(%[sp2]) \n\t" - "sb %[p1_l], +3(%[sp1]) \n\t" - "sb %[p0_l], +3(%[sp0]) \n\t" - "sb %[q0_l], +3(%[sq0]) \n\t" - "sb %[q1_l], +3(%[sq1]) \n\t" - "sb %[q2_l], +3(%[sq2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), - [sq1] "r"(sq1), [sq2] "r"(sq2)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], +3(%[sp1]) \n\t" - "sb %[p0_f0], +3(%[sp0]) \n\t" - "sb %[q0_f0], +3(%[sq0]) \n\t" - "sb %[q1_f0], +3(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { - /* f0 + f1 + f2 */ - /* f0 function */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* f1 function */ - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, - &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, - &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); - - /* f2 function */ - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, 
&q5_r, - &q6_r, &q7_r); - - if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__( - "sb %[p6_r], (%[sp6]) \n\t" - "sb %[p5_r], (%[sp5]) \n\t" - "sb %[p4_r], (%[sp4]) \n\t" - "sb %[p3_r], (%[sp3]) \n\t" - "sb %[p2_r], (%[sp2]) \n\t" - "sb %[p1_r], (%[sp1]) \n\t" - "sb %[p0_r], (%[sp0]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), - [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_r], (%[sq0]) \n\t" - "sb %[q1_r], (%[sq1]) \n\t" - "sb %[q2_r], (%[sq2]) \n\t" - "sb %[q3_r], (%[sq3]) \n\t" - "sb %[q4_r], (%[sq4]) \n\t" - "sb %[q5_r], (%[sq5]) \n\t" - "sb %[q6_r], (%[sq6]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), - [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); - } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r_f1], (%[sp2]) \n\t" - "sb %[p1_r_f1], (%[sp1]) \n\t" - "sb %[p0_r_f1], (%[sp0]) \n\t" - "sb %[q0_r_f1], (%[sq0]) \n\t" - "sb %[q1_r_f1], (%[sq1]) \n\t" - "sb %[q2_r_f1], (%[sq2]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], (%[sp1]) \n\t" - "sb %[p0_f0], (%[sp0]) \n\t" - "sb %[q0_f0], (%[sq0]) \n\t" - "sb %[q1_f0], (%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p6_r], %[p6_r], 16 \n\t" - "srl %[p5_r], %[p5_r], 
16 \n\t" - "srl %[p4_r], %[p4_r], 16 \n\t" - "srl %[p3_r], %[p3_r], 16 \n\t" - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[q3_r], %[q3_r], 16 \n\t" - "srl %[q4_r], %[q4_r], 16 \n\t" - "srl %[q5_r], %[q5_r], 16 \n\t" - "srl %[q6_r], %[q6_r], 16 \n\t" - - : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), - [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), - [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), - [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) - :); - - __asm__ __volatile__( - "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" - "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" - "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" - "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" - "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" - "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), - [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), - [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p6_r], +1(%[sp6]) \n\t" - "sb %[p5_r], +1(%[sp5]) \n\t" - "sb %[p4_r], +1(%[sp4]) \n\t" - "sb %[p3_r], +1(%[sp3]) \n\t" - "sb %[p2_r], +1(%[sp2]) \n\t" - "sb %[p1_r], +1(%[sp1]) \n\t" - "sb %[p0_r], +1(%[sp0]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), - [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_r], +1(%[sq0]) \n\t" - "sb %[q1_r], +1(%[sq1]) \n\t" - 
"sb %[q2_r], +1(%[sq2]) \n\t" - "sb %[q3_r], +1(%[sq3]) \n\t" - "sb %[q4_r], +1(%[sq4]) \n\t" - "sb %[q5_r], +1(%[sq5]) \n\t" - "sb %[q6_r], +1(%[sq6]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), - [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); - } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r_f1], +1(%[sp2]) \n\t" - "sb %[p1_r_f1], +1(%[sp1]) \n\t" - "sb %[p0_r_f1], +1(%[sp0]) \n\t" - "sb %[q0_r_f1], +1(%[sq0]) \n\t" - "sb %[q1_r_f1], +1(%[sq1]) \n\t" - "sb %[q2_r_f1], +1(%[sq2]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], +1(%[sp1]) \n\t" - "sb %[p0_f0], +1(%[sp0]) \n\t" - "sb %[q0_f0], +1(%[sq0]) \n\t" - "sb %[q1_f0], +1(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p6_l], +2(%[sp6]) \n\t" - "sb %[p5_l], +2(%[sp5]) \n\t" - "sb %[p4_l], +2(%[sp4]) \n\t" - "sb %[p3_l], +2(%[sp3]) \n\t" - "sb %[p2_l], +2(%[sp2]) \n\t" - "sb %[p1_l], +2(%[sp1]) \n\t" - "sb %[p0_l], +2(%[sp0]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), 
[sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), - [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_l], +2(%[sq0]) \n\t" - "sb %[q1_l], +2(%[sq1]) \n\t" - "sb %[q2_l], +2(%[sq2]) \n\t" - "sb %[q3_l], +2(%[sq3]) \n\t" - "sb %[q4_l], +2(%[sq4]) \n\t" - "sb %[q5_l], +2(%[sq5]) \n\t" - "sb %[q6_l], +2(%[sq6]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), - [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); - } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l_f1], +2(%[sp2]) \n\t" - "sb %[p1_l_f1], +2(%[sp1]) \n\t" - "sb %[p0_l_f1], +2(%[sp0]) \n\t" - "sb %[q0_l_f1], +2(%[sq0]) \n\t" - "sb %[q1_l_f1], +2(%[sq1]) \n\t" - "sb %[q2_l_f1], +2(%[sq2]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], +2(%[sp1]) \n\t" - "sb %[p0_f0], +2(%[sp0]) \n\t" - "sb %[q0_f0], +2(%[sq0]) \n\t" - "sb %[q1_f0], +2(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - - __asm__ __volatile__( - "srl %[p6_l], %[p6_l], 16 \n\t" - "srl %[p5_l], %[p5_l], 16 \n\t" - "srl %[p4_l], %[p4_l], 16 \n\t" - "srl %[p3_l], %[p3_l], 16 \n\t" - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[q3_l], %[q3_l], 16 \n\t" - "srl %[q4_l], %[q4_l], 16 \n\t" - "srl %[q5_l], %[q5_l], 16 \n\t" - "srl %[q6_l], %[q6_l], 
16 \n\t" - - : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), - [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), - [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), - [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) - :); - - __asm__ __volatile__( - "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" - "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" - "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" - "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" - "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" - "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), - [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), - [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__( - "sb %[p6_l], +3(%[sp6]) \n\t" - "sb %[p5_l], +3(%[sp5]) \n\t" - "sb %[p4_l], +3(%[sp4]) \n\t" - "sb %[p3_l], +3(%[sp3]) \n\t" - "sb %[p2_l], +3(%[sp2]) \n\t" - "sb %[p1_l], +3(%[sp1]) \n\t" - "sb %[p0_l], +3(%[sp0]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), - [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); - - __asm__ __volatile__( - "sb %[q0_l], +3(%[sq0]) \n\t" - "sb %[q1_l], +3(%[sq1]) \n\t" - "sb %[q2_l], +3(%[sq2]) \n\t" - "sb %[q3_l], +3(%[sq3]) \n\t" - "sb %[q4_l], +3(%[sq4]) \n\t" - "sb %[q5_l], +3(%[sq5]) \n\t" - "sb %[q6_l], +3(%[sq6]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), - [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] 
"r"(sq6)); - } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l_f1], +3(%[sp2]) \n\t" - "sb %[p1_l_f1], +3(%[sp1]) \n\t" - "sb %[p0_l_f1], +3(%[sp0]) \n\t" - "sb %[q0_l_f1], +3(%[sq0]) \n\t" - "sb %[q1_l_f1], +3(%[sq1]) \n\t" - "sb %[q2_l_f1], +3(%[sq2]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), - [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), - [sq2] "r"(sq2)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], +3(%[sp1]) \n\t" - "sb %[p0_f0], +3(%[sp0]) \n\t" - "sb %[q0_f0], +3(%[sq0]) \n\t" - "sb %[q1_f0], +3(%[sq1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), - [sq0] "r"(sq0), [sq1] "r"(sq1)); - } - } - - s = s + 4; - } -} - -void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); -} - -void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c deleted file mode 100644 index 3d3f1ec97..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c +++ /dev/null @@ -1,758 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <stdlib.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/mips/common_dspr2.h" -#include "aom_dsp/mips/loopfilter_filters_dspr2.h" -#include "aom_dsp/mips/loopfilter_macros_dspr2.h" -#include "aom_dsp/mips/loopfilter_masks_dspr2.h" -#include "aom_mem/aom_mem.h" - -#if HAVE_DSPR2 -void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat, flat2; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; - - uflimit = *blimit; - ulimit = *limit; - uthresh = *thresh; - - /* create quad-byte */ - __asm__ __volatile__( - "replv.qb %[thresh_vec], %[uthresh] \n\t" - "replv.qb %[flimit_vec], %[uflimit] \n\t" - "replv.qb %[limit_vec], %[ulimit] \n\t" - - : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), - [limit_vec] "=r"(limit_vec) - : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); - - prefetch_store(s + pitch); - - for (i = 0; i < 2; i++) { - s1 = s; - s2 = s + pitch; - s3 = s2 + pitch; - s4 = s3 + pitch; - s = s4 + pitch; - - __asm__ __volatile__( - "lw %[p0], -4(%[s1]) \n\t" - "lw %[p1], -4(%[s2]) \n\t" - "lw %[p2], 
-4(%[s3]) \n\t" - "lw %[p3], -4(%[s4]) \n\t" - "lw %[p4], -8(%[s1]) \n\t" - "lw %[p5], -8(%[s2]) \n\t" - "lw %[p6], -8(%[s3]) \n\t" - "lw %[p7], -8(%[s4]) \n\t" - - : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), - [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - __asm__ __volatile__( - "lw %[q3], (%[s1]) \n\t" - "lw %[q2], (%[s2]) \n\t" - "lw %[q1], (%[s3]) \n\t" - "lw %[q0], (%[s4]) \n\t" - "lw %[q7], +4(%[s1]) \n\t" - "lw %[q6], +4(%[s2]) \n\t" - "lw %[q5], +4(%[s3]) \n\t" - "lw %[q4], +4(%[s4]) \n\t" - - : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), - [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) - : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - - /* transpose p3, p2, p1, p0 - original (when loaded from memory) - register -4 -3 -2 -1 - p0 p0_0 p0_1 p0_2 p0_3 - p1 p1_0 p1_1 p1_2 p1_3 - p2 p2_0 p2_1 p2_2 p2_3 - p3 p3_0 p3_1 p3_2 p3_3 - - after transpose - register - p0 p3_3 p2_3 p1_3 p0_3 - p1 p3_2 p2_2 p1_2 p0_2 - p2 p3_1 p2_1 p1_1 p0_1 - p3 p3_0 p2_0 p1_0 p0_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" - "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" - "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" - "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" - - "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" - "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" - "append %[p1], %[sec3], 16 \n\t" - "append %[p3], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), - [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose q0, q1, q2, q3 - original (when loaded from memory) - register +1 +2 +3 +4 - q3 q3_0 q3_1 
q3_2 q3_3 - q2 q2_0 q2_1 q2_2 q2_3 - q1 q1_0 q1_1 q1_2 q1_3 - q0 q0_0 q0_1 q0_2 q0_3 - - after transpose - register - q3 q0_3 q1_3 q2_3 q3_3 - q2 q0_2 q1_2 q2_2 q3_2 - q1 q0_1 q1_1 q2_1 q3_1 - q0 q0_0 q1_0 q2_0 q3_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" - "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" - "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" - "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" - - "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" - "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" - "append %[q2], %[sec3], 16 \n\t" - "append %[q0], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), - [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose p7, p6, p5, p4 - original (when loaded from memory) - register -8 -7 -6 -5 - p4 p4_0 p4_1 p4_2 p4_3 - p5 p5_0 p5_1 p5_2 p5_3 - p6 p6_0 p6_1 p6_2 p6_3 - p7 p7_0 p7_1 p7_2 p7_3 - - after transpose - register - p4 p7_3 p6_3 p5_3 p4_3 - p5 p7_2 p6_2 p5_2 p4_2 - p6 p7_1 p6_1 p5_1 p4_1 - p7 p7_0 p6_0 p5_0 p4_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" - "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" - "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" - "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" - - "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" - "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" - "append %[p5], %[sec3], 16 \n\t" - "append %[p7], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] 
"+r"(p6), - [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - /* transpose q4, q5, q6, q7 - original (when loaded from memory) - register +5 +6 +7 +8 - q7 q7_0 q7_1 q7_2 q7_3 - q6 q6_0 q6_1 q6_2 q6_3 - q5 q5_0 q5_1 q5_2 q5_3 - q4 q4_0 q4_1 q4_2 q4_3 - - after transpose - register - q7 q4_3 q5_3 q26_3 q7_3 - q6 q4_2 q5_2 q26_2 q7_2 - q5 q4_1 q5_1 q26_1 q7_1 - q4 q4_0 q5_0 q26_0 q7_0 - */ - __asm__ __volatile__( - "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" - "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" - "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" - "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" - - "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" - "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" - "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" - "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" - - "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" - "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" - "append %[q6], %[sec3], 16 \n\t" - "append %[q4], %[sec4], 16 \n\t" - - : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), - [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), - [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) - :); - - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, - p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); - - flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); - - /* f0 */ - if (((flat2 == 0) && (flat == 0) && (mask != 0)) || - ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - STORE_F0() - } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && - (mask == 0xFFFFFFFF)) { - /* f2 */ - PACK_LEFT_0TO3() - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_0TO3() - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, - 
&q6_r, &q7_r); - - STORE_F2() - } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { - /* f1 */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - STORE_F1() - } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { - /* f0 + f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - /* left 2 element operation */ - PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); - - /* right 2 element operation */ - PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); - - if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s4]) \n\t" - "sb %[p1_r], -2(%[s4]) \n\t" - "sb %[p0_r], -1(%[s4]) \n\t" - "sb %[q0_r], (%[s4]) \n\t" - "sb %[q1_r], +1(%[s4]) \n\t" - "sb %[q2_r], +2(%[s4]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s4] "r"(s4)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s4]) \n\t" - "sb %[p0_f0], -1(%[s4]) \n\t" - "sb %[q0_f0], (%[s4]) \n\t" - "sb %[q1_f0], +1(%[s4]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s4] "r"(s4)); - } - - __asm__ __volatile__( - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), - [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if 
(mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r], -3(%[s3]) \n\t" - "sb %[p1_r], -2(%[s3]) \n\t" - "sb %[p0_r], -1(%[s3]) \n\t" - "sb %[q0_r], (%[s3]) \n\t" - "sb %[q1_r], +1(%[s3]) \n\t" - "sb %[q2_r], +2(%[s3]) \n\t" - - : - : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), - [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [s3] "r"(s3)); - } else if (mask & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s3]) \n\t" - "sb %[p0_f0], -1(%[s3]) \n\t" - "sb %[q0_f0], (%[s3]) \n\t" - "sb %[q1_f0], +1(%[s3]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s3] "r"(s3)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s2] "r"(s2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s2]) \n\t" - "sb %[p0_f0], -1(%[s2]) \n\t" - "sb %[q0_f0], (%[s2]) \n\t" - "sb %[q1_f0], +1(%[s2]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s2] "r"(s2)); - } - - __asm__ __volatile__( - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l] 
"+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), - [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l], -3(%[s1]) \n\t" - "sb %[p1_l], -2(%[s1]) \n\t" - "sb %[p0_l], -1(%[s1]) \n\t" - "sb %[q0_l], (%[s1]) \n\t" - "sb %[q1_l], +1(%[s1]) \n\t" - "sb %[q2_l], +2(%[s1]) \n\t" - - : - : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), - [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [s1] "r"(s1)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s1]) \n\t" - "sb %[p0_f0], -1(%[s1]) \n\t" - "sb %[q0_f0], (%[s1]) \n\t" - "sb %[q1_f0], +1(%[s1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s1] "r"(s1)); - } - } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { - /* f0+f1+f2 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - - PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, - &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); - - PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, - &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); - - PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, - &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, - &q6_l, &q7_l); - - PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, - &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, - &q6_r, &q7_r); - - if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__( - "sb %[p6_r], -7(%[s4]) \n\t" - "sb %[p5_r], -6(%[s4]) \n\t" - "sb %[p4_r], -5(%[s4]) \n\t" - "sb %[p3_r], -4(%[s4]) \n\t" - "sb %[p2_r], -3(%[s4]) \n\t" - "sb %[p1_r], -2(%[s4]) \n\t" - "sb %[p0_r], -1(%[s4]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] 
"r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [p0_r] "r"(p0_r), [s4] "r"(s4)); - - __asm__ __volatile__( - "sb %[q0_r], (%[s4]) \n\t" - "sb %[q1_r], +1(%[s4]) \n\t" - "sb %[q2_r], +2(%[s4]) \n\t" - "sb %[q3_r], +3(%[s4]) \n\t" - "sb %[q4_r], +4(%[s4]) \n\t" - "sb %[q5_r], +5(%[s4]) \n\t" - "sb %[q6_r], +6(%[s4]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [s4] "r"(s4)); - } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__( - "sb %[p2_r_f1], -3(%[s4]) \n\t" - "sb %[p1_r_f1], -2(%[s4]) \n\t" - "sb %[p0_r_f1], -1(%[s4]) \n\t" - "sb %[q0_r_f1], (%[s4]) \n\t" - "sb %[q1_r_f1], +1(%[s4]) \n\t" - "sb %[q2_r_f1], +2(%[s4]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); - } else if (mask & 0x000000FF) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s4]) \n\t" - "sb %[p0_f0], -1(%[s4]) \n\t" - "sb %[q0_f0], (%[s4]) \n\t" - "sb %[q1_f0], +1(%[s4]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s4] "r"(s4)); - } - - __asm__ __volatile__( - "srl %[p6_r], %[p6_r], 16 \n\t" - "srl %[p5_r], %[p5_r], 16 \n\t" - "srl %[p4_r], %[p4_r], 16 \n\t" - "srl %[p3_r], %[p3_r], 16 \n\t" - "srl %[p2_r], %[p2_r], 16 \n\t" - "srl %[p1_r], %[p1_r], 16 \n\t" - "srl %[p0_r], %[p0_r], 16 \n\t" - "srl %[q0_r], %[q0_r], 16 \n\t" - "srl %[q1_r], %[q1_r], 16 \n\t" - "srl %[q2_r], %[q2_r], 16 \n\t" - "srl %[q3_r], %[q3_r], 16 \n\t" - "srl %[q4_r], %[q4_r], 16 \n\t" - "srl %[q5_r], %[q5_r], 16 \n\t" - "srl %[q6_r], %[q6_r], 16 \n\t" - - : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), - [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), - [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), - [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), - [p1_r] "+r"(p1_r), 
[p0_r] "+r"(p0_r) - :); - - __asm__ __volatile__( - "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" - "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" - "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" - "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" - "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" - "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), - [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), - [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p6_r], -7(%[s3]) \n\t" - "sb %[p5_r], -6(%[s3]) \n\t" - "sb %[p4_r], -5(%[s3]) \n\t" - "sb %[p3_r], -4(%[s3]) \n\t" - "sb %[p2_r], -3(%[s3]) \n\t" - "sb %[p1_r], -2(%[s3]) \n\t" - "sb %[p0_r], -1(%[s3]) \n\t" - - : - : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), - [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), - [p0_r] "r"(p0_r), [s3] "r"(s3)); - - __asm__ __volatile__( - "sb %[q0_r], (%[s3]) \n\t" - "sb %[q1_r], +1(%[s3]) \n\t" - "sb %[q2_r], +2(%[s3]) \n\t" - "sb %[q3_r], +3(%[s3]) \n\t" - "sb %[q4_r], +4(%[s3]) \n\t" - "sb %[q5_r], +5(%[s3]) \n\t" - "sb %[q6_r], +6(%[s3]) \n\t" - - : - : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), - [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), - [q6_r] "r"(q6_r), [s3] "r"(s3)); - } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__( - "sb %[p2_r_f1], -3(%[s3]) \n\t" - "sb %[p1_r_f1], -2(%[s3]) \n\t" - "sb %[p0_r_f1], -1(%[s3]) \n\t" - "sb %[q0_r_f1], (%[s3]) \n\t" - "sb %[q1_r_f1], +1(%[s3]) \n\t" - "sb %[q2_r_f1], +2(%[s3]) \n\t" - - : - : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), - [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), - [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); - } else if (mask & 0x0000FF00) { - 
__asm__ __volatile__( - "sb %[p1_f0], -2(%[s3]) \n\t" - "sb %[p0_f0], -1(%[s3]) \n\t" - "sb %[q0_f0], (%[s3]) \n\t" - "sb %[q1_f0], +1(%[s3]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s3] "r"(s3)); - } - - __asm__ __volatile__( - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p6_l], -7(%[s2]) \n\t" - "sb %[p5_l], -6(%[s2]) \n\t" - "sb %[p4_l], -5(%[s2]) \n\t" - "sb %[p3_l], -4(%[s2]) \n\t" - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), [s2] "r"(s2)); - - __asm__ __volatile__( - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - "sb %[q3_l], +3(%[s2]) \n\t" - "sb %[q4_l], +4(%[s2]) \n\t" - "sb %[q5_l], +5(%[s2]) \n\t" - "sb %[q6_l], +6(%[s2]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [q6_l] "r"(q6_l), [s2] "r"(s2)); - } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p2_l_f1], -3(%[s2]) \n\t" - "sb %[p1_l_f1], -2(%[s2]) \n\t" - "sb %[p0_l_f1], -1(%[s2]) \n\t" - "sb %[q0_l_f1], (%[s2]) \n\t" - "sb %[q1_l_f1], +1(%[s2]) \n\t" - "sb %[q2_l_f1], +2(%[s2]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); - } else if (mask & 0x00FF0000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s2]) \n\t" - "sb %[p0_f0], -1(%[s2]) \n\t" - "sb %[q0_f0], (%[s2]) \n\t" - "sb %[q1_f0], +1(%[s2]) \n\t" - - : - : [p1_f0] 
"r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s2] "r"(s2)); - } - - __asm__ __volatile__( - "srl %[p6_l], %[p6_l], 16 \n\t" - "srl %[p5_l], %[p5_l], 16 \n\t" - "srl %[p4_l], %[p4_l], 16 \n\t" - "srl %[p3_l], %[p3_l], 16 \n\t" - "srl %[p2_l], %[p2_l], 16 \n\t" - "srl %[p1_l], %[p1_l], 16 \n\t" - "srl %[p0_l], %[p0_l], 16 \n\t" - "srl %[q0_l], %[q0_l], 16 \n\t" - "srl %[q1_l], %[q1_l], 16 \n\t" - "srl %[q2_l], %[q2_l], 16 \n\t" - "srl %[q3_l], %[q3_l], 16 \n\t" - "srl %[q4_l], %[q4_l], 16 \n\t" - "srl %[q5_l], %[q5_l], 16 \n\t" - "srl %[q6_l], %[q6_l], 16 \n\t" - - : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), - [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), - [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), - [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), - [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) - :); - - __asm__ __volatile__( - "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" - "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" - "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" - "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" - "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" - "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" - "srl %[p1_f0], %[p1_f0], 8 \n\t" - "srl %[p0_f0], %[p0_f0], 8 \n\t" - "srl %[q0_f0], %[q0_f0], 8 \n\t" - "srl %[q1_f0], %[q1_f0], 8 \n\t" - - : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), - [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), - [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), - [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), - [q1_f0] "+r"(q1_f0) - :); - - if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__( - "sb %[p6_l], -7(%[s1]) \n\t" - "sb %[p5_l], -6(%[s1]) \n\t" - "sb %[p4_l], -5(%[s1]) \n\t" - "sb %[p3_l], -4(%[s1]) \n\t" - "sb %[p2_l], -3(%[s1]) \n\t" - "sb %[p1_l], -2(%[s1]) \n\t" - "sb %[p0_l], -1(%[s1]) \n\t" - - : - : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), - [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), - [p0_l] "r"(p0_l), [s1] "r"(s1)); - - __asm__ 
__volatile__( - "sb %[q0_l], (%[s1]) \n\t" - "sb %[q1_l], 1(%[s1]) \n\t" - "sb %[q2_l], 2(%[s1]) \n\t" - "sb %[q3_l], 3(%[s1]) \n\t" - "sb %[q4_l], 4(%[s1]) \n\t" - "sb %[q5_l], 5(%[s1]) \n\t" - "sb %[q6_l], 6(%[s1]) \n\t" - - : - : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), - [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), - [q6_l] "r"(q6_l), [s1] "r"(s1)); - } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__( - "sb %[p2_l_f1], -3(%[s1]) \n\t" - "sb %[p1_l_f1], -2(%[s1]) \n\t" - "sb %[p0_l_f1], -1(%[s1]) \n\t" - "sb %[q0_l_f1], (%[s1]) \n\t" - "sb %[q1_l_f1], +1(%[s1]) \n\t" - "sb %[q2_l_f1], +2(%[s1]) \n\t" - - : - : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), - [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), - [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); - } else if (mask & 0xFF000000) { - __asm__ __volatile__( - "sb %[p1_f0], -2(%[s1]) \n\t" - "sb %[p0_f0], -1(%[s1]) \n\t" - "sb %[q0_f0], (%[s1]) \n\t" - "sb %[q1_f0], +1(%[s1]) \n\t" - - : - : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), - [q1_f0] "r"(q1_f0), [s1] "r"(s1)); - } - } - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h deleted file mode 100644 index 54b0bb4bd..000000000 --- a/third_party/aom/aom_dsp/mips/loopfilter_msa.h +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ -#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" - -#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - filt = filt & (v16i8)hev_in; \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - /* combine left and right part */ \ - filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ - \ - filt = filt & (v16i8)mask_in; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ - } - -#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, 
cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ - } - -#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ - v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 zero_in = { 0 }; \ - \ - tmp_flat4 = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = 
__msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp_flat4 < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ - } - -#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ - q6_in, q7_in, flat_in, flat2_out) \ - { \ - v16u8 tmp_flat5, zero_in = { 0 }; \ - v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ - v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ - \ - tmp_flat5 = __msa_ori_b(zero_in, 1); \ - p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ - q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ - p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ - q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ - p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ - q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ - p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ - q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ - \ - p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ - flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ - flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ - p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ - flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ - p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ - flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ - \ - flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ - flat2_out = __msa_xori_b(flat2_out, 0xff); \ - flat2_out = flat2_out & flat_in; \ - } - -#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ - p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ - q1_filt8_out, q2_filt8_out) \ - { \ - v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ - \ - tmp_filt8_2 = p2_in 
+ p1_in + p0_in; \ - tmp_filt8_0 = p3_in << 1; \ - \ - tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ - tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_1 = q2_in + q1_in + q0_in; \ - tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ - tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ - tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ - \ - tmp_filt8_0 = q2_in + q3_in; \ - tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ - tmp_filt8_1 = q3_in + q3_in; \ - tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ - q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_0 = tmp_filt8_2 + q3_in; \ - tmp_filt8_1 = tmp_filt8_0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - \ - tmp_filt8_1 = tmp_filt8_0 - p2_in; \ - tmp_filt8_0 = q1_in + q3_in; \ - tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ - } - -#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ - limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ - flat_out) \ - { \ - v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ - v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ - \ - /* absolute subtraction of pixel values */ \ - p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ - p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ - p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ - q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ - q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ - q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ - p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ - p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ - \ - /* calculation of hev */ \ - flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ - hev_out = thresh_in < (v16u8)flat_out; 
\ - \ - /* calculation of mask */ \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ - p1_asub_q1_m >>= 1; \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ - \ - mask_out = b_limit_in < p0_asub_q0_m; \ - mask_out = __msa_max_u_b(flat_out, mask_out); \ - p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ - mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ - q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ - mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ - \ - mask_out = limit_in < (v16u8)mask_out; \ - mask_out = __msa_xori_b(mask_out, 0xff); \ - } -#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h deleted file mode 100644 index 9bfc27147..000000000 --- a/third_party/aom/aom_dsp/mips/macros_msa.h +++ /dev/null @@ -1,2058 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_ -#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_ - -#include <msa.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) 
LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) - -#if (__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) -#else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ - }) -#endif // (__mips == 64) - -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : 
[val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } -#else // !(__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) - -#if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ - }) -#else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m_combined = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m_combined = (uint64_t)(val1_m); \ - val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ - val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ - \ - val_m_combined; \ - }) -#endif // (__mips == 64) - -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, 
pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ - } -#endif // (__mips_isa_rev >= 6) - -/* Description : Load 4 words with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1, out2, out3 - Details : Load word in 'out0' from (psrc) - Load word in 'out1' from (psrc + stride) - Load word in 'out2' from (psrc + 2 * stride) - Load word in 'out3' from (psrc + 3 * stride) -*/ -#define LW4(psrc, stride, out0, out1, out2, out3) \ - { \ - out0 = LW((psrc)); \ - out1 = LW((psrc) + stride); \ - out2 = LW((psrc) + 2 * stride); \ - out3 = LW((psrc) + 3 * stride); \ - } - -/* Description : Load double words with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load double word in 'out0' from (psrc) - Load double word in 'out1' from (psrc + stride) -*/ -#define LD2(psrc, stride, out0, out1) \ - { \ - out0 = LD((psrc)); \ - out1 = LD((psrc) + stride); \ - } -#define LD4(psrc, stride, out0, out1, out2, out3) \ - { \ - LD2((psrc), stride, out0, out1); \ - LD2((psrc) + 2 * stride, stride, out2, out3); \ - } - -/* Description : Store 4 words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Store word from 'in0' to (pdst) - Store word from 'in1' to (pdst + stride) - Store word from 'in2' to (pdst + 2 * stride) - Store word from 'in3' to (pdst + 3 * stride) -*/ -#define SW4(in0, in1, in2, in3, pdst, stride) \ - { \ - SW(in0, (pdst)) \ - SW(in1, (pdst) + stride); \ - SW(in2, (pdst) + 2 * stride); \ - SW(in3, (pdst) + 3 * stride); \ - } - -/* Description : Store 4 double words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Store double word from 'in0' to (pdst) - Store double word from 'in1' to (pdst + stride) - Store double word from 'in2' to (pdst + 2 * stride) - Store double 
word from 'in3' to (pdst + 3 * stride) -*/ -#define SD4(in0, in1, in2, in3, pdst, stride) \ - { \ - SD(in0, (pdst)) \ - SD(in1, (pdst) + stride); \ - SD(in2, (pdst) + 2 * stride); \ - SD(in3, (pdst) + 3 * stride); \ - } - -/* Description : Load vectors with 16 byte elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Load 16 byte elements in 'out0' from (psrc) - Load 16 byte elements in 'out1' from (psrc + stride) -*/ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ - } -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) - -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ - { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ - } -#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) - -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) - -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ - { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ - } -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) - -#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ - { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ - } -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) - -#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_UB8(...) 
LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) - -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ - } -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) - -#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7, out8, out9, out10, out11, out12, out13, out14, out15) \ - { \ - LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ - out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ - out13, out14, out15); \ - } -#define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) - -/* Description : Load 4x4 block of signed halfword elements from 1D source - data into 4 vectors (Each vector with 4 signed halfwords) - Arguments : Input - psrc - Outputs - out0, out1, out2, out3 -*/ -#define LD4x4_SH(psrc, out0, out1, out2, out3) \ - { \ - out0 = LD_SH(psrc); \ - out2 = LD_SH(psrc + 8); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - } - -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) \ - { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ - } - -/* Description : Store vectors of 16 byte elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 16 byte elements from 'in0' to (pdst) - Store 16 byte elements from 'in1' to (pdst + stride) -*/ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_SH2(...) 
ST_H2(v8i16, __VA_ARGS__) - -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) - -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) \ - { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ - } - -/* Description : Store 2x4 byte block to destination memory from input vector - Arguments : Inputs - in, stidx, pdst, stride - Details : Index 'stidx' halfword element from 'in' vector is copied to - the GP register and stored to (pdst) - Index 'stidx+1' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + stride) - Index 'stidx+2' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + 2 * stride) - Index 'stidx+3' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + 3 * stride) -*/ -#define ST2x4_UB(in, stidx, pdst, stride) \ - { \ - uint16_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ - out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ - out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ - out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ - \ - SH(out0_m, pblk_2x4_m); \ - SH(out1_m, pblk_2x4_m + stride); \ - SH(out2_m, pblk_2x4_m + 2 * stride); \ - SH(out3_m, pblk_2x4_m + 3 * stride); \ - } - -/* Description : Store 4x2 byte block to destination memory from 
input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 word element from 'in' vector is copied to the GP - register and stored to (pdst) - Index 1 word element from 'in' vector is copied to the GP - register and stored to (pdst + stride) -*/ -#define ST4x2_UB(in, pdst, stride) \ - { \ - uint32_t out0_m, out1_m; \ - uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in, 0); \ - out1_m = __msa_copy_u_w((v4i32)in, 1); \ - \ - SW(out0_m, pblk_4x2_m); \ - SW(out1_m, pblk_4x2_m + stride); \ - } - -/* Description : Store 4x4 byte block to destination memory from input vectors - Arguments : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride - Details : 'Idx0' word element from input vector 'in0' is copied to the - GP register and stored to (pdst) - 'Idx1' word element from input vector 'in0' is copied to the - GP register and stored to (pdst + stride) - 'Idx2' word element from input vector 'in1' is copied to the - GP register and stored to (pdst + 2 * stride) - 'Idx3' word element from input vector 'in1' is copied to the - GP register and stored to (pdst + 3 * stride) -*/ -#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ - { \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ - out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ - out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ - out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ - \ - SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ - } -#define ST4x8_UB(in0, in1, pdst, stride) \ - { \ - uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ - \ - ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ - ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ - } - -/* Description : Store 8x1 byte block to destination memory from input vector - Arguments : Inputs - in, pdst - Details : Index 0 double word element from 'in' vector is copied to the - GP register and stored to (pdst) -*/ -#define 
ST8x1_UB(in, pdst) \ - { \ - uint64_t out0_m; \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - SD(out0_m, pdst); \ - } - -/* Description : Store 8x2 byte block to destination memory from input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 double word element from 'in' vector is copied to the - GP register and stored to (pdst) - Index 1 double word element from 'in' vector is copied to the - GP register and stored to (pdst + stride) -*/ -#define ST8x2_UB(in, pdst, stride) \ - { \ - uint64_t out0_m, out1_m; \ - uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - out1_m = __msa_copy_u_d((v2i64)in, 1); \ - \ - SD(out0_m, pblk_8x2_m); \ - SD(out1_m, pblk_8x2_m + stride); \ - } - -/* Description : Store 8x4 byte block to destination memory from input - vectors - Arguments : Inputs - in0, in1, pdst, stride - Details : Index 0 double word element from 'in0' vector is copied to the - GP register and stored to (pdst) - Index 1 double word element from 'in0' vector is copied to the - GP register and stored to (pdst + stride) - Index 0 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 2 * stride) - Index 1 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 3 * stride) -*/ -#define ST8x4_UB(in0, in1, pdst, stride) \ - { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in0, 0); \ - out1_m = __msa_copy_u_d((v2i64)in0, 1); \ - out2_m = __msa_copy_u_d((v2i64)in1, 0); \ - out3_m = __msa_copy_u_d((v2i64)in1, 1); \ - \ - SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ - } - -/* Description : average with rounding (in0 + in1 + 1) / 2. - Arguments : Inputs - in0, in1, in2, in3, - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned byte element from 'in0' vector is added with - each unsigned byte element from 'in1' vector. 
Then the average - with rounding is calculated and written to 'out0' -*/ -#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ - out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ - } -#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) - -#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ - } -#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) - -/* Description : Immediate number of elements to slide with zero - Arguments : Inputs - in0, in1, slide_val - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'zero_m' vector are slid into 'in0' by - value specified in the 'slide_val' -*/ -#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ - { \ - v16i8 zero_m = { 0 }; \ - out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ - } -#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) - -#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ - slide_val) \ - { \ - SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ - SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ - } -#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) - -/* Description : Immediate number of elements to slide - Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by - value specified in the 'slide_val' -*/ -#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ - { \ - out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ - } -#define SLDI_B2_UB(...) 
SLDI_B2(v16u8, __VA_ARGS__) -#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) - -#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ - out2, slide_val) \ - { \ - SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ - out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ - } -#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) -#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) - -/* Description : Shuffle byte vector elements as per mask vector - Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0' & 'in1' are copied selectively to - 'out0' as per control vector 'mask0' -*/ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ - } -#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) -#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) -#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) - -#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ - out3) \ - { \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ - } -#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) -#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) - -/* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Unsigned byte elements from 'mult0' are multiplied with - unsigned byte elements from 'cnst0' producing a result - twice the size of input i.e. unsigned halfword. 
- The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ - out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ - } -#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) - -#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) - -/* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed byte elements from 'mult0' are multiplied with - signed byte elements from 'cnst0' producing a result - twice the size of input i.e. signed halfword. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ - } -#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) - -#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) - -/* Description : Dot product of halfword vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'mult0' are multiplied with - signed halfword elements from 'cnst0' producing a result - twice the size of input i.e. 
signed word. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ - } -#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) - -#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) - -/* Description : Dot product of word vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed word elements from 'mult0' are multiplied with - signed word elements from 'cnst0' producing a result - twice the size of input i.e. signed double word. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ - } -#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) - -/* Description : Dot product & addition of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed byte elements from 'mult0' are multiplied with - signed byte elements from 'cnst0' producing a result - twice the size of input i.e. signed halfword. 
- The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ - } -#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) - -#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ - cnst3, out0, out1, out2, out3) \ - { \ - DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ - } -#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) - -/* Description : Dot product & addition of halfword vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'mult0' are multiplied with - signed halfword elements from 'cnst0' producing a result - twice the size of input i.e. signed word. - The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ - } -#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) - -/* Description : Dot product & addition of double word vector elements - Arguments : Inputs - mult0, mult1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each signed word element from 'mult0' is multiplied with itself - producing an intermediate result twice the size of input - i.e. 
signed double word - The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ - out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ - } -#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) - -/* Description : Minimum values between unsigned elements of - either vector are copied to the output vector - Arguments : Inputs - in0, in1, min_vec - Outputs - in place operation - Return Type - as per RTYPE - Details : Minimum of unsigned halfword element values from 'in0' and - 'min_vec' are written to output vector 'in0' -*/ -#define MIN_UH2(RTYPE, in0, in1, min_vec) \ - { \ - in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ - in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ - } -#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) - -#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ - { \ - MIN_UH2(RTYPE, in0, in1, min_vec); \ - MIN_UH2(RTYPE, in2, in3, min_vec); \ - } -#define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) - -/* Description : Clips all signed halfword elements of input vector - between 0 & 255 - Arguments : Input - in - Output - out_m - Return Type - signed halfword -*/ -#define CLIP_SH_0_255(in) \ - ({ \ - v8i16 max_m = __msa_ldi_h(255); \ - v8i16 out_m; \ - \ - out_m = __msa_maxi_s_h((v8i16)in, 0); \ - out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ - out_m; \ - }) -#define CLIP_SH2_0_255(in0, in1) \ - { \ - in0 = CLIP_SH_0_255(in0); \ - in1 = CLIP_SH_0_255(in1); \ - } -#define CLIP_SH4_0_255(in0, in1, in2, in3) \ - { \ - CLIP_SH2_0_255(in0, in1); \ - CLIP_SH2_0_255(in2, in3); \ - } - -/* Description : Horizontal addition of 4 signed word elements of input vector - Arguments : Input - in (signed word vector) - Output - sum_m (i32 sum) - Return Type - signed word (GP) - Details : 4 signed word elements of 'in' vector are added together and - the resulting integer sum is returned -*/ -#define HADD_SW_S32(in) \ - ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ - }) - -/* Description : Horizontal addition of 8 unsigned halfword elements - Arguments : Inputs - in (unsigned halfword vector) - Outputs - sum_m (u32 sum) - Return Type - unsigned word - Details : 8 unsigned halfword elements of input vector are added - together and the resulting integer sum is returned -*/ -#define HADD_UH_U32(in) \ - ({ \ - v4u32 res_m; \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - res0_m = __msa_hadd_u_d(res_m, res_m); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ - }) - -/* Description : Horizontal addition of unsigned byte vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - 
Details : Each unsigned odd byte element from 'in0' is added to - even unsigned byte element from 'in0' (pairwise) and the - halfword result is written to 'out0' -*/ -#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ - } -#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) - -#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - HADD_UB2(RTYPE, in0, in1, out0, out1); \ - HADD_UB2(RTYPE, in2, in3, out2, out3); \ - } -#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) - -/* Description : Horizontal subtraction of unsigned byte vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned odd byte element from 'in0' is subtracted from - even unsigned byte element from 'in0' (pairwise) and the - halfword result is written to 'out0' -*/ -#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ - } -#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) - -/* Description : SAD (Sum of Absolute Difference) - Arguments : Inputs - in0, in1, ref0, ref1 - Outputs - sad_m (halfword vector) - Return Type - unsigned halfword - Details : Absolute difference of all the byte elements from 'in0' with - 'ref0' is calculated and preserved in 'diff0'. Then even-odd - pairs are added together to generate 8 halfword results. 
-*/ -#define SAD_UB2_UH(in0, in1, ref0, ref1) \ - ({ \ - v16u8 diff0_m, diff1_m; \ - v8u16 sad_m = { 0 }; \ - \ - diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ - diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ - \ - sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ - sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ - \ - sad_m; \ - }) - -/* Description : Horizontal subtraction of signed halfword vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each signed odd halfword element from 'in0' is subtracted from - even signed halfword element from 'in0' (pairwise) and the - word result is written to 'out0' -*/ -#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ - out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ - } -#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) - -/* Description : Set element n input vector to GPR value - Arguments : Inputs - in0, in1, in2, in3 - Output - out - Return Type - as per RTYPE - Details : Set element 0 in vector 'out' to value specified in 'in0' -*/ -#define INSERT_W2(RTYPE, in0, in1, out) \ - { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ - } -#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) - -#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ - { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ - } -#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) -#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) - -#define INSERT_D2(RTYPE, in0, in1, out) \ - { \ - out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ - out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ - } -#define INSERT_D2_UB(...) 
INSERT_D2(v16u8, __VA_ARGS__) -#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) - -/* Description : Interleave even byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even byte elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ - } -#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) -#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) - -/* Description : Interleave even halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even halfword elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ - out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ - } -#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) -#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) -#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) - -/* Description : Interleave even word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even word elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ - out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ - } -#define ILVEV_W2_SB(...) 
ILVEV_W2(v16i8, __VA_ARGS__) - -/* Description : Interleave even double word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even double word elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ - out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ - } -#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) - -/* Description : Interleave left half of byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of byte elements of 'in0' and 'in1' are interleaved - and written to 'out0'. -*/ -#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ - } -#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) -#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) -#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) -#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) - -#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) -#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) -#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) - -/* Description : Interleave left half of halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of halfword elements of 'in0' and 'in1' are - interleaved and written to 'out0'. 
-*/ -#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ - } -#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) -#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) - -/* Description : Interleave left half of word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of word elements of 'in0' and 'in1' are interleaved - and written to 'out0'. -*/ -#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ - } -#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) -#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) - -/* Description : Interleave right half of byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements of 'in0' and 'in1' are interleaved - and written to out0. -*/ -#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ - } -#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) -#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) -#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) -#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) - -#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) -#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) -#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) -#define ILVR_B4_SH(...) 
ILVR_B4(v8i16, __VA_ARGS__) - -#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ - in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ - out5, out6, out7) \ - { \ - ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3); \ - ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ - out6, out7); \ - } -#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) - -/* Description : Interleave right half of halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of halfword elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ - } -#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) -#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) - -#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) - -#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ - } -#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) -#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) - -#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_W4_UB(...) 
ILVR_W4(v16u8, __VA_ARGS__) - -/* Description : Interleave right half of double word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of double word elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ - out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ - } -#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) -#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) -#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) - -#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ - { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ - } -#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) - -#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) -#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) - -/* Description : Interleave both left and right half of input vectors - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements from 'in0' and 'in1' are - interleaved and written to 'out0' -*/ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - } -#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) -#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) -#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) -#define ILVRL_B2_SH(...) 
ILVRL_B2(v8i16, __VA_ARGS__) - -#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ - } -#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) -#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) - -#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ - } -#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) -#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) -#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) - -/* Description : Saturate the halfword element values to the max - unsigned value of (sat_val + 1) bits - The element data width remains unchanged - Arguments : Inputs - in0, in1, sat_val - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val + 1) bit range. - The results are written in place -*/ -#define SAT_UH2(RTYPE, in0, in1, sat_val) \ - { \ - in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ - } -#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) - -#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ - { \ - SAT_UH2(RTYPE, in0, in1, sat_val); \ - SAT_UH2(RTYPE, in2, in3, sat_val) \ - } -#define SAT_UH4_UH(...) 
SAT_UH4(v8u16, __VA_ARGS__) - -/* Description : Saturate the halfword element values to the max - unsigned value of (sat_val + 1) bits - The element data width remains unchanged - Arguments : Inputs - in0, in1, sat_val - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val + 1) bit range - The results are written in place -*/ -#define SAT_SH2(RTYPE, in0, in1, sat_val) \ - { \ - in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ - } -#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) - -#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ - { \ - SAT_SH2(RTYPE, in0, in1, sat_val); \ - SAT_SH2(RTYPE, in2, in3, sat_val); \ - } -#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) - -/* Description : Indexed halfword element values are replicated to all - elements in output vector - Arguments : Inputs - in, idx0, idx1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : 'idx0' element value from 'in' vector is replicated to all - elements in 'out0' vector - Valid index range for halfword operation is 0-7 -*/ -#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ - { \ - out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ - out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ - } -#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) - -#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ - { \ - SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ - SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ - } -#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) -#define SPLATI_H4_SH(...) 
SPLATI_H4(v8i16, __VA_ARGS__) - -/* Description : Pack even byte elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even byte elements of 'in0' are copied to the left half of - 'out0' & even byte elements of 'in1' are copied to the right - half of 'out0'. -*/ -#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ - } -#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) -#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) -#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) - -#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) -#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) -#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) - -/* Description : Pack even halfword elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even halfword elements of 'in0' are copied to the left half of - 'out0' & even halfword elements of 'in1' are copied to the - right half of 'out0'. -*/ -#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ - } -#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) -#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) - -#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define PCKEV_H4_SH(...) 
PCKEV_H4(v8i16, __VA_ARGS__) - -/* Description : Pack even double word elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even double elements of 'in0' are copied to the left half of - 'out0' & even double elements of 'in1' are copied to the right - half of 'out0'. -*/ -#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ - out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ - } -#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) -#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) - -#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) - -/* Description : Each byte element is logically xor'ed with immediate 128 - Arguments : Inputs - in0, in1 - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned byte element from input vector 'in0' is - logically xor'ed with 128 and the result is stored in-place. -*/ -#define XORI_B2_128(RTYPE, in0, in1) \ - { \ - in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ - in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ - } -#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) -#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) - -#define XORI_B3_128(RTYPE, in0, in1, in2) \ - { \ - XORI_B2_128(RTYPE, in0, in1); \ - in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ - } -#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) - -#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ - { \ - XORI_B2_128(RTYPE, in0, in1); \ - XORI_B2_128(RTYPE, in2, in3); \ - } -#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) -#define XORI_B4_128_SB(...) 
XORI_B4_128(v16i8, __VA_ARGS__) - -#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ - { \ - XORI_B4_128(RTYPE, in0, in1, in2, in3); \ - XORI_B3_128(RTYPE, in4, in5, in6); \ - } -#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) - -/* Description : Average of signed halfword elements -> (a + b) / 2 - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3 - Return Type - as per RTYPE - Details : Each signed halfword element from 'in0' is added to each - signed halfword element of 'in1' with full precision resulting - in one extra bit in the result. The result is then divided by - 2 and written to 'out0' -*/ -#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ - out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ - out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ - } -#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) - -/* Description : Addition of signed halfword elements and signed saturation - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'in0' are added to signed - halfword elements of 'in1'. The result is then signed saturated - between halfword data type range -*/ -#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ - out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ - } -#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) - -#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3) \ - { \ - ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ - } -#define ADDS_SH4_SH(...) 
ADDS_SH4(v8i16, __VA_ARGS__) - -/* Description : Shift left all elements of vector (generic for all data types) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in place operation - Return Type - as per input vector RTYPE - Details : Each element of vector 'in0' is left shifted by 'shift' and - the result is written in-place. -*/ -#define SLLI_4V(in0, in1, in2, in3, shift) \ - { \ - in0 = in0 << shift; \ - in1 = in1 << shift; \ - in2 = in2 << shift; \ - in3 = in3 << shift; \ - } - -/* Description : Arithmetic shift right all elements of vector - (generic for all data types) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in place operation - Return Type - as per input vector RTYPE - Details : Each element of vector 'in0' is right shifted by 'shift' and - the result is written in-place. 'shift' is a GP variable. -*/ -#define SRA_4V(in0, in1, in2, in3, shift) \ - { \ - in0 = in0 >> shift; \ - in1 = in1 >> shift; \ - in2 = in2 >> shift; \ - in3 = in3 >> shift; \ - } - -/* Description : Shift right arithmetic rounded words - Arguments : Inputs - in0, in1, shift - Outputs - in place operation - Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetically by - the number of bits in the corresponding element in the vector - 'shift'. The last discarded bit is added to shifted value for - rounding and the result is written in-place. - 'shift' is a vector. -*/ -#define SRAR_W2(RTYPE, in0, in1, shift) \ - { \ - in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ - in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ - } - -#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ - { \ - SRAR_W2(RTYPE, in0, in1, shift) \ - SRAR_W2(RTYPE, in2, in3, shift) \ - } -#define SRAR_W4_SW(...) 
SRAR_W4(v4i32, __VA_ARGS__) - -/* Description : Shift right arithmetic rounded (immediate) - Arguments : Inputs - in0, in1, shift - Outputs - in place operation - Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetically by - the value in 'shift'. The last discarded bit is added to the - shifted value for rounding and the result is written in-place. - 'shift' is an immediate value. -*/ -#define SRARI_H2(RTYPE, in0, in1, shift) \ - { \ - in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ - in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ - } -#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) -#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) - -#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ - { \ - SRARI_H2(RTYPE, in0, in1, shift); \ - SRARI_H2(RTYPE, in2, in3, shift); \ - } -#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) -#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) - -#define SRARI_W2(RTYPE, in0, in1, shift) \ - { \ - in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ - in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ - } -#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) - -#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ - { \ - SRARI_W2(RTYPE, in0, in1, shift); \ - SRARI_W2(RTYPE, in2, in3, shift); \ - } -#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) - -/* Description : Logical shift right all elements of vector (immediate) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - out0, out1, out2, out3 - Return Type - as per RTYPE - Details : Each element of vector 'in0' is right shifted by 'shift' and - the result is written in-place. 'shift' is an immediate value. 
-*/ -#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ - { \ - out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ - out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ - out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ - out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ - } -#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) - -/* Description : Multiplication of pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element from 'in0' is multiplied with elements from 'in1' - and the result is written to 'out0' -*/ -#define MUL2(in0, in1, in2, in3, out0, out1) \ - { \ - out0 = in0 * in1; \ - out1 = in2 * in3; \ - } -#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ - { \ - MUL2(in0, in1, in2, in3, out0, out1); \ - MUL2(in4, in5, in6, in7, out2, out3); \ - } - -/* Description : Addition of 2 pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element in 'in0' is added to 'in1' and result is written - to 'out0'. -*/ -#define ADD2(in0, in1, in2, in3, out0, out1) \ - { \ - out0 = in0 + in1; \ - out1 = in2 + in3; \ - } -#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ - { \ - ADD2(in0, in1, in2, in3, out0, out1); \ - ADD2(in4, in5, in6, in7, out2, out3); \ - } - -/* Description : Subtraction of 2 pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element in 'in1' is subtracted from 'in0' and result is - written to 'out0'. 
-*/ -#define SUB2(in0, in1, in2, in3, out0, out1) \ - { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ - } -#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ - { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ - out2 = in4 - in5; \ - out3 = in6 - in7; \ - } - -/* Description : Sign extend halfword elements from right half of the vector - Arguments : Input - in (halfword vector) - Output - out (sign extended word vector) - Return Type - signed word - Details : Sign bit of halfword elements from input vector 'in' is - extracted and interleaved with same vector 'in0' to generate - 4 word elements keeping sign intact -*/ -#define UNPCK_R_SH_SW(in, out) \ - { \ - v8i16 sign_m; \ - \ - sign_m = __msa_clti_s_h((v8i16)in, 0); \ - out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ - } - -/* Description : Zero extend unsigned byte elements to halfword elements - Arguments : Input - in (unsigned byte vector) - Outputs - out0, out1 (unsigned halfword vectors) - Return Type - signed halfword - Details : Zero extended right half of vector is returned in 'out0' - Zero extended left half of vector is returned in 'out1' -*/ -#define UNPCK_UB_SH(in, out0, out1) \ - { \ - v16i8 zero_m = { 0 }; \ - \ - ILVRL_B2_SH(zero_m, in, out0, out1); \ - } - -/* Description : Sign extend halfword elements from input vector and return - the result in pair of vectors - Arguments : Input - in (halfword vector) - Outputs - out0, out1 (sign extended word vectors) - Return Type - signed word - Details : Sign bit of halfword elements from input vector 'in' is - extracted and interleaved right with same vector 'in0' to - generate 4 signed word elements in 'out0' - Then interleaved left with same vector 'in0' to - generate 4 signed word elements in 'out1' -*/ -#define UNPCK_SH_SW(in, out0, out1) \ - { \ - v8i16 tmp_m; \ - \ - tmp_m = __msa_clti_s_h((v8i16)in, 0); \ - ILVRL_H2_SW(tmp_m, in, out0, out1); \ - } - -/* Description : Butterfly of 4 input vectors - Arguments : Inputs - 
in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Details : Butterfly operation -*/ -#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - out0 = in0 + in3; \ - out1 = in1 + in2; \ - \ - out2 = in1 - in2; \ - out3 = in0 - in3; \ - } - -/* Description : Butterfly of 8 input vectors - Arguments : Inputs - in0 ... in7 - Outputs - out0 .. out7 - Details : Butterfly operation -*/ -#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - out0 = in0 + in7; \ - out1 = in1 + in6; \ - out2 = in2 + in5; \ - out3 = in3 + in4; \ - \ - out4 = in3 - in4; \ - out5 = in2 - in5; \ - out6 = in1 - in6; \ - out7 = in0 - in7; \ - } - -/* Description : Butterfly of 16 input vectors - Arguments : Inputs - in0 ... in15 - Outputs - out0 .. out15 - Details : Butterfly operation -*/ -#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ - in11, in12, in13, in14, in15, out0, out1, out2, out3, \ - out4, out5, out6, out7, out8, out9, out10, out11, out12, \ - out13, out14, out15) \ - { \ - out0 = in0 + in15; \ - out1 = in1 + in14; \ - out2 = in2 + in13; \ - out3 = in3 + in12; \ - out4 = in4 + in11; \ - out5 = in5 + in10; \ - out6 = in6 + in9; \ - out7 = in7 + in8; \ - \ - out8 = in7 - in8; \ - out9 = in6 - in9; \ - out10 = in5 - in10; \ - out11 = in4 - in11; \ - out12 = in3 - in12; \ - out13 = in2 - in13; \ - out14 = in1 - in14; \ - out15 = in0 - in15; \ - } - -/* Description : Transpose input 8x8 byte block - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - as per RTYPE -*/ -#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7) \ - { \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ - tmp3_m); \ - 
ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ - ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ - ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ - ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ - SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ - SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ - } -#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) - -/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - unsigned byte -*/ -#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ - in10, in11, in12, in13, in14, in15, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ - ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ - ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ - ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ - \ - tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ - tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ - tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ - tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ - out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ - tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ - out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ - tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ - \ - ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ - out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ - out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out6 = 
(v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ - out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - } - -/* Description : Transpose 4x4 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Return Type - signed halfword -*/ -#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 s0_m, s1_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ - ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ - } - -/* Description : Transpose 4x8 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - signed halfword -*/ -#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ - v8i16 zero_m = { 0 }; \ - \ - ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ - tmp3_n); \ - ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ - ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ - \ - out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ 
- out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - \ - out4 = zero_m; \ - out5 = zero_m; \ - out6 = zero_m; \ - out7 = zero_m; \ - } - -/* Description : Transpose 8x4 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - signed halfword -*/ -#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ - ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ - ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ - } - -/* Description : Transpose 8x8 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - as per RTYPE -*/ -#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 s0_m, s1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ - ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ - ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ - PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ - tmp7_m, out0, out2, out4, out6); \ - out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ - } -#define TRANSPOSE8x8_SH_SH(...) 
TRANSPOSE8x8_H(v8i16, __VA_ARGS__) - -/* Description : Transpose 4x4 block with word elements in vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Return Type - signed word -*/ -#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ - ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ - \ - out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ - out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ - out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ - out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ - } - -/* Description : Add block 4x4 - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Least significant 4 bytes from each input vector are added to - the destination bytes, clipped between 0-255 and stored. -*/ -#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ - { \ - uint32_t src0_m, src1_m, src2_m, src3_m; \ - v8i16 inp0_m, inp1_m, res0_m, res1_m; \ - v16i8 dst0_m = { 0 }; \ - v16i8 dst1_m = { 0 }; \ - v16i8 zero_m = { 0 }; \ - \ - ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ - LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ - INSERT_W2_SB(src0_m, src1_m, dst0_m); \ - INSERT_W2_SB(src2_m, src3_m, dst1_m); \ - ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ - ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ - CLIP_SH2_0_255(res0_m, res1_m); \ - PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ - ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ - } - -/* Description : Pack even elements of input vectors & xor with 128 - Arguments : Inputs - in0, in1 - Output - out_m - Return Type - unsigned byte - Details : Signed byte even elements from 'in0' and 'in1' are packed - together in one vector and the resulting vector is xor'ed with - 128 to shift the range from signed to unsigned byte -*/ -#define PCKEV_XORI128_UB(in0, in1) \ - ({ \ - v16u8 out_m; \ - \ - 
out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ - out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ - out_m; \ - }) - -/* Description : Converts inputs to unsigned bytes, interleave, average & store - as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride -*/ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ - pdst, stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ - } - -/* Description : Pack even byte elements and store byte vector in destination - memory - Arguments : Inputs - in0, in1, pdst -*/ -#define PCKEV_ST_SB(in0, in1, pdst) \ - { \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ - ST_SB(tmp_m, (pdst)); \ - } - -/* Description : Horizontal 2 tap filter kernel code - Arguments : Inputs - in0, in1, mask, coeff, shift -*/ -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ - ({ \ - v16i8 tmp0_m; \ - v8u16 tmp1_m; \ - \ - tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ - tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ - tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ - \ - tmp1_m; \ - }) -#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c deleted file mode 100644 index 58cdd80d9..000000000 --- a/third_party/aom/aom_dsp/mips/sad_msa.c +++ /dev/null @@ -1,800 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ - { \ - out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ - } -#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) - -static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 diff; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - - diff = __msa_asub_u_b(src, ref); - sad += __msa_hadd_u_h(diff, diff); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - 
ref0, ref1); - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - LD_UB2(ref, ref_stride, ref0, ref1); - ref += (2 * ref_stride); - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - LD_UB2(ref, ref_stride, ref0, ref1); - ref += (2 * ref_stride); - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB2(ref, 16, ref0, ref1); - ref += ref_stride; - sad += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - uint32_t sad = 0; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, 
src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = HADD_UH_U32(sad0); - sad += HADD_UH_U32(sad1); - - return sad; -} - -static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, src); - src_ptr += (4 * src_stride); - - LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref0_ptr += (4 * ref_stride); - - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref1_ptr += (4 * ref_stride); - - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref2_ptr += (4 * ref_stride); - - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ref3_ptr += (4 * ref_stride); - - diff = 
__msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - int32_t ht_cnt; - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; - v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref0_ptr += (4 * ref_stride); - LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); - ref1_ptr += (4 * ref_stride); - LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); - ref2_ptr += (4 * ref_stride); - LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); - ref3_ptr += (4 * ref_stride); - - PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const 
uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - int32_t ht_cnt; - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - v16u8 src, ref0, ref1, ref2, ref3, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref0 = LD_UB(ref0_ptr); - ref0_ptr += ref_stride; - ref1 = LD_UB(ref1_ptr); - ref1_ptr += ref_stride; - ref2 = LD_UB(ref2_ptr); - ref2_ptr += ref_stride; - ref3 = LD_UB(ref3_ptr); - ref3_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref1); - sad1 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref2); - sad2 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref3); - sad3 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref0 = LD_UB(ref0_ptr); - ref0_ptr += ref_stride; - ref1 = LD_UB(ref1_ptr); - ref1_ptr += ref_stride; - ref2 = LD_UB(ref2_ptr); - ref2_ptr += ref_stride; - ref3 = LD_UB(ref3_ptr); - ref3_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref1); - sad1 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref2); - sad2 += __msa_hadd_u_h(diff, diff); - diff = __msa_asub_u_b(src, ref3); - sad3 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; 
- v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - - LD_UB2(ref0_ptr, 16, ref0, ref1); - ref0_ptr += ref_stride; - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(ref1_ptr, 16, ref0, ref1); - ref1_ptr += ref_stride; - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(ref2_ptr, 16, ref0, ref1); - ref2_ptr += ref_stride; - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(ref3_ptr, 16, ref0, ref1); - ref3_ptr += ref_stride; - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); -} - -static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *const aref_ptr[], - int32_t ref_stride, int32_t height, - uint32_t *sad_array) { - const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - - ref0_ptr = aref_ptr[0]; - ref1_ptr = aref_ptr[1]; - ref2_ptr = aref_ptr[2]; - ref3_ptr = aref_ptr[3]; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - - LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); - ref0_ptr += ref_stride; - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); - ref1_ptr += ref_stride; - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); - ref2_ptr 
+= ref_stride; - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); - ref3_ptr += ref_stride; - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad_array[0] = HADD_UH_U32(sad0_0); - sad_array[0] += HADD_UH_U32(sad0_1); - sad_array[1] = HADD_UH_U32(sad1_0); - sad_array[1] += HADD_UH_U32(sad1_1); - sad_array[2] = HADD_UH_U32(sad2_0); - sad_array[2] += HADD_UH_U32(sad2_1); - sad_array[3] = HADD_UH_U32(sad3_0); - sad_array[3] += HADD_UH_U32(sad3_1); -} - -static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 diff, pred, comp; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - - comp = __msa_aver_u_b(pred, ref); - diff = __msa_asub_u_b(src, comp); - sad += __msa_hadd_u_h(diff, diff); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 diff0, diff1, pred0, pred1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - 
PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); - sad += SAD_UB2_UH(src0, src1, diff0, diff1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3, comp0, comp1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 3); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += (4 * 16); - AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); - sad += SAD_UB2_UH(src0, src1, comp0, comp1); - AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); - sad += SAD_UB2_UH(src2, src3, comp0, comp1); - - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); - ref += (4 * ref_stride); - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += (4 * 16); - AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); - sad += SAD_UB2_UH(src0, src1, comp0, comp1); - AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); - sad += SAD_UB2_UH(src2, src3, comp0, comp1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; - v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 comp0, comp1; - v8u16 sad = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - 
LD_UB4(src + 16, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); - LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); - ref += (4 * ref_stride); - - LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); - LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); - sec_pred += (4 * 32); - - AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); - sad += SAD_UB2_UH(src0, src1, comp0, comp1); - AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); - sad += SAD_UB2_UH(src2, src3, comp0, comp1); - AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); - sad += SAD_UB2_UH(src4, src5, comp0, comp1); - AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); - sad += SAD_UB2_UH(src6, src7, comp0, comp1); - } - - return HADD_UH_U32(sad); -} - -static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, const uint8_t *sec_pred) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 comp0, comp1, comp2, comp3; - v16u8 pred0, pred1, pred2, pred3; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v4u32 sad; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, 
comp3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); - - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0, ref1, ref2, ref3); - ref += ref_stride; - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, - comp1, comp2, comp3); - sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); - sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); - } - - sad = __msa_hadd_u_w(sad0, sad0); - sad += __msa_hadd_u_w(sad1, sad1); - - return HADD_SW_S32(sad); -} - -#define AOM_SAD_4xHEIGHT_MSA(height) \ - uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_8xHEIGHT_MSA(height) \ - uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_16xHEIGHT_MSA(height) \ - uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_32xHEIGHT_MSA(height) \ - uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_64xHEIGHT_MSA(height) \ - uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t 
*ref, int32_t ref_stride) { \ - return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ - } - -#define AOM_SAD_4xHEIGHTx4D_MSA(height) \ - void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_8xHEIGHTx4D_MSA(height) \ - void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_16xHEIGHTx4D_MSA(height) \ - void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_32xHEIGHTx4D_MSA(height) \ - void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_SAD_64xHEIGHTx4D_MSA(height) \ - void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ - } - -#define AOM_AVGSAD_4xHEIGHT_MSA(height) \ - uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_8xHEIGHT_MSA(height) \ - uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return 
avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_16xHEIGHT_MSA(height) \ - uint32_t aom_sad16x##height##_avg_msa( \ - const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ - int32_t ref_stride, const uint8_t *second_pred) { \ - return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_32xHEIGHT_MSA(height) \ - uint32_t aom_sad32x##height##_avg_msa( \ - const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ - int32_t ref_stride, const uint8_t *second_pred) { \ - return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -#define AOM_AVGSAD_64xHEIGHT_MSA(height) \ - uint32_t aom_sad64x##height##_avg_msa( \ - const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ - int32_t ref_stride, const uint8_t *second_pred) { \ - return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ - second_pred); \ - } - -/* clang-format off */ -// 64x64 -AOM_SAD_64xHEIGHT_MSA(64) -AOM_SAD_64xHEIGHTx4D_MSA(64) -AOM_AVGSAD_64xHEIGHT_MSA(64) - -// 64x32 -AOM_SAD_64xHEIGHT_MSA(32) -AOM_SAD_64xHEIGHTx4D_MSA(32) -AOM_AVGSAD_64xHEIGHT_MSA(32) - -// 32x64 -AOM_SAD_32xHEIGHT_MSA(64) -AOM_SAD_32xHEIGHTx4D_MSA(64) -AOM_AVGSAD_32xHEIGHT_MSA(64) - -// 32x32 -AOM_SAD_32xHEIGHT_MSA(32) -AOM_SAD_32xHEIGHTx4D_MSA(32) -AOM_AVGSAD_32xHEIGHT_MSA(32) - -// 32x16 -AOM_SAD_32xHEIGHT_MSA(16) -AOM_SAD_32xHEIGHTx4D_MSA(16) -AOM_AVGSAD_32xHEIGHT_MSA(16) - -// 16x32 -AOM_SAD_16xHEIGHT_MSA(32) -AOM_SAD_16xHEIGHTx4D_MSA(32) -AOM_AVGSAD_16xHEIGHT_MSA(32) - -// 16x16 -AOM_SAD_16xHEIGHT_MSA(16) -AOM_SAD_16xHEIGHTx4D_MSA(16) -AOM_AVGSAD_16xHEIGHT_MSA(16) - -// 16x8 -AOM_SAD_16xHEIGHT_MSA(8) -AOM_SAD_16xHEIGHTx4D_MSA(8) -AOM_AVGSAD_16xHEIGHT_MSA(8) - -// 8x16 -AOM_SAD_8xHEIGHT_MSA(16) -AOM_SAD_8xHEIGHTx4D_MSA(16) -AOM_AVGSAD_8xHEIGHT_MSA(16) - -// 8x8 -AOM_SAD_8xHEIGHT_MSA(8) -AOM_SAD_8xHEIGHTx4D_MSA(8) -AOM_AVGSAD_8xHEIGHT_MSA(8) - -// 8x4 
-AOM_SAD_8xHEIGHT_MSA(4) -AOM_SAD_8xHEIGHTx4D_MSA(4) -AOM_AVGSAD_8xHEIGHT_MSA(4) - -// 4x8 -AOM_SAD_4xHEIGHT_MSA(8) -AOM_SAD_4xHEIGHTx4D_MSA(8) -AOM_AVGSAD_4xHEIGHT_MSA(8) - -// 4x4 -AOM_SAD_4xHEIGHT_MSA(4) -AOM_SAD_4xHEIGHTx4D_MSA(4) -AOM_AVGSAD_4xHEIGHT_MSA(4) - /* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c deleted file mode 100644 index 810b6efaa..000000000 --- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c +++ /dev/null @@ -1,1792 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" -#include "aom_dsp/mips/macros_msa.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/variance.h" - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ - } - -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) - -#define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) - -static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t height, - int32_t *diff) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - v16u8 pred, src = { 0 }; - v16u8 ref = { 0 }; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t height, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(sec_pred, 
16, pred0, pred1); - sec_pred += 32; - LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src, ref, pred; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - - pred = LD_UB(sec_pred); - sec_pred += 16; - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - src = __msa_aver_u_b(src, pred); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, 
- int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1, pred0, pred1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1, pred0, pred1; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; 
- LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, src3); - 
CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - const uint8_t *ref_ptr, - int32_t ref_stride, - const uint8_t *sec_pred, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v8i16 avg2 = { 0 }; - v8i16 avg3 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 32; ht_cnt--;) { - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - - LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); - sec_pred += 64; - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, - src2, 
src3); - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - vec += __msa_hadd_s_w(avg2, avg2); - vec += __msa_hadd_s_w(avg3, avg3); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_4width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 filt0, ref = { 0 }; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); - src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); - CALC_MSE_AVG_B(src0, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t 
loop_cnt; - v16u8 filt0, out, ref0, ref1, ref2, ref3; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); - CALC_MSE_AVG_B(out, ref0, var, avg); - out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); - CALC_MSE_AVG_B(out, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v16u8 dst0, dst1, dst2, dst3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * 
src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, - src2, src3); - CALC_MSE_AVG_B(src0, dst0, var, avg); - CALC_MSE_AVG_B(src1, dst1, var, avg); - CALC_MSE_AVG_B(src2, dst2, var, avg); - CALC_MSE_AVG_B(src3, dst3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t 
sub_pixel_sse_diff_4width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4, out; - v16u8 src10_r, src32_r, src21_r, src43_r; - v16u8 ref = { 0 }; - v16u8 src2110, src4332; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - v8u16 tmp0, tmp1; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 vec0, vec1, vec2, vec3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, 
ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1, out2, out3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out2 = 
(v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - src0 = src4; - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, - filter, height, &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_4width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out, ref = { 0 }; - v16u8 filt_vt, filt_hz, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; - v8u16 hz_out0, hz_out1, hz_out2, 
hz_out3, hz_out4; - v8u16 tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_8width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1; - v8u16 tmp0, tmp1, tmp2, tmp3; - v16u8 filt_vt, filt_hz, vec0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - 
hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_16width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3; - v8u16 tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = 
LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - LD_UB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - CALC_MSE_AVG_B(src0, 
ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - CALC_MSE_AVG_B(src2, ref2, var, avg); - CALC_MSE_AVG_B(src3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_sse_diff_32width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height, - &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_sse_diff_64width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height, - &diff0[loop_cnt]); - src += 16; - dst += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 out, pred, filt0, ref = { 0 }; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, 
src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); - out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 out, pred, filt0; - v16u8 ref0, ref1, ref2, ref3; - v16i8 src0, src1, src2, src3; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 vec0, vec1, vec2, vec3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, 
vec2, vec2, vec3, vec3, src0, src1, - src2, src3); - out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); - - pred = LD_UB(sec_pred); - sec_pred += 16; - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref0, var, avg); - out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); - pred = LD_UB(sec_pred); - sec_pred += 16; - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff, int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v16u8 dst0, dst1, dst2, dst3; - v16u8 tmp0, tmp1, tmp2, tmp3; - v16u8 pred0, pred1, pred2, pred3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - dst += (4 * dst_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, 
out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, - tmp2, tmp3); - AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, - tmp2, tmp3); - - CALC_MSE_AVG_B(tmp0, dst0, var, avg); - CALC_MSE_AVG_B(tmp1, dst1, var, avg); - CALC_MSE_AVG_B(tmp2, dst2, var, avg); - CALC_MSE_AVG_B(tmp3, dst3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - 
return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 src10_r, src32_r, src21_r, src43_r; - v16u8 out, pred, ref = { 0 }; - v16u8 src2110, src4332, filt0; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - v8u16 tmp0, tmp1; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, filt0; - v8u16 vec0, vec1, vec2, vec3; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = 
LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); - AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff, int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v16u8 src0, src1, src2, src3, src4; - v16u8 out0, out1, out2, out3, filt0; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter); - filt0 = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, 
(v16i8)tmp0); - - ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); - - src0 = src4; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, - out2, out3); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( - const uint8_t *src, int32_t 
src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, - int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += - subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, - filter, height, &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 out, pred, ref = { 0 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - pred = LD_UB(sec_pred); - sec_pred += 16; - LW4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, 
vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, pred); - CALC_MSE_AVG_B(out, ref, var, avg); - src0 = src4; - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 ref0, ref1, ref2, ref3; - v16u8 src0, src1, src2, src3, src4; - v16u8 pred0, pred1, out0, out1; - v16u8 filt_hz, filt_vt, vec0; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - src0 = LD_UB(src); - src += src_stride; - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - LD_UB2(sec_pred, 16, pred0, pred1); - sec_pred += 32; - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - - vec0 = 
(v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); - AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t subpel_avg_ssediff_16w_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { - int16_t filtval; - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 ref0, ref1, ref2, ref3; - v16u8 pred0, pred1, pred2, pred3; - v16u8 out0, out1, out2, out3; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - filtval = LH(filter_horiz); - filt_hz = (v16u8)__msa_fill_h(filtval); - filtval = LH(filter_vert); - filt_vt = (v16u8)__msa_fill_h(filtval); - - LD_UB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); - sec_pred += (4 * width); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, 
mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - - LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); - dst += (4 * dst_stride); - - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, - out2, out3); - - CALC_MSE_AVG_B(out0, ref0, var, avg); - CALC_MSE_AVG_B(out1, ref1, var, avg); - CALC_MSE_AVG_B(out2, ref2, var, avg); - CALC_MSE_AVG_B(out3, ref3, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, 
int32_t height, int32_t *diff) { - return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, diff, 16); -} - -static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[2]; - - for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, &diff0[loop_cnt], 32); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1]; - - return sse; -} - -static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( - const uint8_t *src, int32_t src_stride, const uint8_t *dst, - int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, - const uint8_t *filter_vert, int32_t height, int32_t *diff) { - uint32_t loop_cnt, sse = 0; - int32_t diff0[4]; - - for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, - sec_pred, filter_horiz, filter_vert, - height, &diff0[loop_cnt], 64); - src += 16; - dst += 16; - sec_pred += 16; - } - - *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; - - return sse; -} - -#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); -#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); -#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); - -#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); -#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); 
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); -#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); - -#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ - uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \ - const uint8_t *src, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sse) { \ - int32_t diff; \ - uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ - src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ - src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ - } \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ - src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ - sse); \ - } \ - } \ - \ - return var; \ - } - -/* clang-format off */ -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64) - -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32) -AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) -/* clang-format on */ - -#define 
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ - uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ - uint32_t *sse, const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ - &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ - &diff); \ - } else { \ - *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ - ref_stride, sec_pred, ht, &diff); \ - } \ - } \ - \ - return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } - -/* clang-format off */ -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8) - -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16) - -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32) - -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16) -AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32) -/* clang-format on */ - -uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, - int32_t src_stride, - int32_t xoffset, int32_t yoffset, - const uint8_t *ref_ptr, - int32_t ref_stride, uint32_t *sse, - const uint8_t *sec_pred) { - int32_t diff; - const uint8_t *h_filter = bilinear_filters_2t[xoffset]; - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; - - if (yoffset) { - 
if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_hv_msa( - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, - v_filter, 64, &diff); - } else { - *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, - ref_stride, sec_pred, - v_filter, 64, &diff); - } - } else { - if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, - ref_stride, sec_pred, - h_filter, 64, &diff); - } else { - *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, - sec_pred, &diff); - } - } - - return VARIANCE_32Wx64H(*sse, diff); -} - -#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ - uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ - uint32_t *sse, const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ - &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ - src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ - &diff); \ - } else { \ - *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ - ref_stride, sec_pred, &diff); \ - } \ - } \ - \ - return VARIANCE_64Wx##ht##H(*sse, diff); \ - } - -/* clang-format off */ -AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32) -AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64) -/* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c deleted file mode 100644 index bfed773ac..000000000 --- 
a/third_party/aom/aom_dsp/mips/subtract_msa.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *pred_ptr, int32_t pred_stride, - int16_t *diff_ptr, int32_t diff_stride) { - uint32_t src0, src1, src2, src3; - uint32_t pred0, pred1, pred2, pred3; - v16i8 src = { 0 }; - v16i8 pred = { 0 }; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - LW4(src_ptr, src_stride, src0, src1, src2, src3); - LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); - INSERT_W4_SB(src0, src1, src2, src3, src); - INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); - ILVRL_B2_UB(src, pred, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); -} - -static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *pred_ptr, int32_t pred_stride, - int16_t *diff_ptr, int32_t diff_stride) { - uint32_t loop_cnt; - uint64_t src0, src1, pred0, pred1; - v16i8 src = { 0 }; - v16i8 pred = { 0 }; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 4; loop_cnt--;) { - LD2(src_ptr, src_stride, src0, src1); - src_ptr += (2 * src_stride); - LD2(pred_ptr, pred_stride, pred0, pred1); - pred_ptr += (2 * pred_stride); - - INSERT_D2_SB(src0, src1, src); - INSERT_D2_SB(pred0, pred1, pred); - ILVRL_B2_UB(src, pred, src_l0, src_l1); - 
HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff_ptr, diff_stride); - diff_ptr += (2 * diff_stride); - } -} - -static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - int8_t count; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (count = 2; count--;) { - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, - pred7); - pred += (8 * pred_stride); - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - } -} - -static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, 
int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 8; loop_cnt--;) { - LD_SB2(src, 16, src0, src1); - src += src_stride; - LD_SB2(src, 16, src2, src3); - src += src_stride; - LD_SB2(src, 16, src4, src5); - src += src_stride; - LD_SB2(src, 16, src6, src7); - src += src_stride; - - LD_SB2(pred, 16, pred0, pred1); - pred += pred_stride; - LD_SB2(pred, 16, pred2, pred3); - pred += pred_stride; - LD_SB2(pred, 16, pred4, pred5); - pred += pred_stride; - LD_SB2(pred, 16, pred6, pred7); - pred += pred_stride; - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - } -} - -static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - uint32_t 
loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 32; loop_cnt--;) { - LD_SB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_SB4(src, 16, src4, src5, src6, src7); - src += src_stride; - - LD_SB4(pred, 16, pred0, pred1, pred2, pred3); - pred += pred_stride; - LD_SB4(pred, 16, pred4, pred5, pred6, pred7); - pred += pred_stride; - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 32, 8); - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 48, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 32, 8); - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 48, 8); - diff += diff_stride; - } -} - -void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, - ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, - ptrdiff_t pred_stride) { - if (rows == cols) { - switch (rows) { - case 4: - sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - case 8: - sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - 
break; - case 16: - sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - case 32: - sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - case 64: - sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, - diff_stride); - break; - default: - aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - } - } else { - aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, - pred_ptr, pred_stride); - } -} diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c deleted file mode 100644 index 065c09ac5..000000000 --- a/third_party/aom/aom_dsp/mips/variance_msa.c +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/mips/macros_msa.h" - -#define CALC_MSE_B(src, ref, var) \ - { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - } - -#define CALC_MSE_AVG_B(src, ref, var, sub) \ - { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ - } - -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) - -#define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) - -static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - int32_t ht_cnt; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - 
LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src, ref; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src, ref, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8i16 avg = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr 
+= ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg); - CALC_MSE_AVG_B(src1, ref1, var, avg); - } - - vec = __msa_hadd_s_w(avg, avg); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t *diff) 
{ - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 16; ht_cnt--;) { - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src2, ref2, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src3, ref3, var, avg1); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += __msa_hadd_s_w(avg1, avg1); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t *diff) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v8i16 avg0 = { 0 }; - v8i16 avg1 = { 0 }; - v8i16 avg2 = { 0 }; - v8i16 avg3 = { 0 }; - v4i32 vec, var = { 0 }; - - for (ht_cnt = 32; ht_cnt--;) { - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - CALC_MSE_AVG_B(src0, ref0, var, avg0); - CALC_MSE_AVG_B(src1, ref1, var, avg1); - CALC_MSE_AVG_B(src2, ref2, var, avg2); - CALC_MSE_AVG_B(src3, ref3, var, avg3); - } - - vec = __msa_hadd_s_w(avg0, avg0); - vec += 
__msa_hadd_s_w(avg1, avg1); - vec += __msa_hadd_s_w(avg2, avg2); - vec += __msa_hadd_s_w(avg3, avg3); - *diff = HADD_SW_S32(vec); - - return HADD_SW_S32(var); -} - -static uint32_t get_mb_ss_msa(const int16_t *src) { - uint32_t sum, cnt; - v8i16 src0, src1, src2, src3; - v4i32 src0_l, src1_l, src2_l, src3_l; - v4i32 src0_r, src1_r, src2_r, src3_r; - v2i64 sq_src_l = { 0 }; - v2i64 sq_src_r = { 0 }; - - for (cnt = 8; cnt--;) { - LD_SH4(src, 8, src0, src1, src2, src3); - src += 4 * 8; - - UNPCK_SH_SW(src0, src0_l, src0_r); - UNPCK_SH_SW(src1, src1_l, src1_r); - UNPCK_SH_SW(src2, src2_l, src2_r); - UNPCK_SH_SW(src3, src3_l, src3_r); - - DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r); - DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r); - DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r); - DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r); - } - - sq_src_l += __msa_splati_d(sq_src_l, 1); - sq_src_r += __msa_splati_d(sq_src_r, 1); - - sum = __msa_copy_s_d(sq_src_l, 0); - sum += __msa_copy_s_d(sq_src_r, 0); - - return sum; -} - -static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v4i32 var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - INSERT_W4_UB(src0, src1, src2, src3, src); - INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - CALC_MSE_B(src, ref, var); - } - - return HADD_SW_S32(var); -} - -static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v4i32 var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - 
LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, - ref0, ref1); - CALC_MSE_B(src0, ref0, var); - CALC_MSE_B(src1, ref1, var); - } - - return HADD_SW_S32(var); -} - -static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src, ref; - v4i32 var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_B(src, ref, var); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_B(src, ref, var); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_B(src, ref, var); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - ref = LD_UB(ref_ptr); - ref_ptr += ref_stride; - CALC_MSE_B(src, ref, var); - } - - return HADD_SW_S32(var); -} - -static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, ref0, ref1; - v4i32 var = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_B(src0, ref0, var); - CALC_MSE_B(src1, ref1, var); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_B(src0, ref0, var); - CALC_MSE_B(src1, ref1, var); - - LD_UB2(src_ptr, 16, src0, src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_B(src0, ref0, var); - CALC_MSE_B(src1, ref1, var); - - LD_UB2(src_ptr, 16, src0, 
src1); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - CALC_MSE_B(src0, ref0, var); - CALC_MSE_B(src1, ref1, var); - } - - return HADD_SW_S32(var); -} - -static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3; - v4i32 var = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - CALC_MSE_B(src0, ref0, var); - CALC_MSE_B(src2, ref2, var); - CALC_MSE_B(src1, ref1, var); - CALC_MSE_B(src3, ref3, var); - - LD_UB4(src_ptr, 16, src0, src1, src2, src3); - src_ptr += src_stride; - LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); - ref_ptr += ref_stride; - CALC_MSE_B(src0, ref0, var); - CALC_MSE_B(src2, ref2, var); - CALC_MSE_B(src1, ref1, var); - CALC_MSE_B(src3, ref3, var); - } - - return HADD_SW_S32(var); -} - -uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride) { - uint32_t err = 0; - uint32_t src0, src1, src2, src3; - uint32_t ref0, ref1, ref2, ref3; - v16i8 src = { 0 }; - v16i8 ref = { 0 }; - v16u8 src_vec0, src_vec1; - v8i16 diff0, diff1; - v4i32 err0 = { 0 }; - v4i32 err1 = { 0 }; - - LW4(src_ptr, src_stride, src0, src1, src2, src3); - LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - INSERT_W4_SB(src0, src1, src2, src3, src); - INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); - ILVRL_B2_UB(src, ref, src_vec0, src_vec1); - HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); - DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); - err = HADD_SW_S32(err0); - err += HADD_SW_S32(err1); - - return err; -} - -#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); -#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx4H(sse, diff) 
VARIANCE_WxH(sse, diff, 5); -#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); -#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); -#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); - -#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); -#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); -#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); -#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); -#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); - -#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \ - uint32_t aom_variance##wd##x##ht##_msa( \ - const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ - int32_t ref_stride, uint32_t *sse) { \ - int32_t diff; \ - \ - *sse = \ - sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ - \ - return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } - -/* clang-format off */ -AOM_VARIANCE_WDXHT_MSA(4, 4) -AOM_VARIANCE_WDXHT_MSA(4, 8) - -AOM_VARIANCE_WDXHT_MSA(8, 4) -AOM_VARIANCE_WDXHT_MSA(8, 8) -AOM_VARIANCE_WDXHT_MSA(8, 16) - -AOM_VARIANCE_WDXHT_MSA(16, 8) -AOM_VARIANCE_WDXHT_MSA(16, 16) -AOM_VARIANCE_WDXHT_MSA(16, 32) - -AOM_VARIANCE_WDXHT_MSA(32, 16) -AOM_VARIANCE_WDXHT_MSA(32, 32) -/* clang-format on */ - -uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { - int32_t diff; - - *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff); - - return VARIANCE_32Wx64H(*sse, diff); -} - -uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { - int32_t diff; - - *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff); - - return VARIANCE_64Wx32H(*sse, diff); -} - -uint32_t 
aom_variance64x64_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { - int32_t diff; - - *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff); - - return VARIANCE_64Wx64H(*sse, diff); -} - -uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { - *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); - - return *sse; -} - -uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { - *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16); - - return *sse; -} - -uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { - *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8); - - return *sse; -} - -uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { - *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16); - - return *sse; -} - -void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, uint32_t *sse, - int32_t *sum) { - *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); -} - -void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, uint32_t *sse, - int32_t *sum) { - *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); -} - -uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c deleted file mode 100644 index 2faee8506..000000000 --- a/third_party/aom/aom_dsp/noise_model.c +++ /dev/null @@ -1,1648 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <math.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/noise_model.h" -#include "aom_dsp/noise_util.h" -#include "aom_mem/aom_mem.h" -#include "av1/common/common.h" -#include "av1/encoder/mathutils.h" - -#define kLowPolyNumParams 3 - -static const int kMaxLag = 4; - -// Defines a function that can be used to obtain the mean of a block for the -// provided data type (uint8_t, or uint16_t) -#define GET_BLOCK_MEAN(INT_TYPE, suffix) \ - static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \ - int stride, int x_o, int y_o, \ - int block_size) { \ - const int max_h = AOMMIN(h - y_o, block_size); \ - const int max_w = AOMMIN(w - x_o, block_size); \ - double block_mean = 0; \ - for (int y = 0; y < max_h; ++y) { \ - for (int x = 0; x < max_w; ++x) { \ - block_mean += data[(y_o + y) * stride + x_o + x]; \ - } \ - } \ - return block_mean / (max_w * max_h); \ - } - -GET_BLOCK_MEAN(uint8_t, lowbd); -GET_BLOCK_MEAN(uint16_t, highbd); - -static INLINE double get_block_mean(const uint8_t *data, int w, int h, - int stride, int x_o, int y_o, - int block_size, int use_highbd) { - if (use_highbd) - return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o, - block_size); - return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size); -} - -// Defines a function that can be used to obtain the variance of a block -// for the provided data type (uint8_t, or uint16_t) -#define 
GET_NOISE_VAR(INT_TYPE, suffix) \ - static double get_noise_var_##suffix( \ - const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \ - int h, int x_o, int y_o, int block_size_x, int block_size_y) { \ - const int max_h = AOMMIN(h - y_o, block_size_y); \ - const int max_w = AOMMIN(w - x_o, block_size_x); \ - double noise_var = 0; \ - double noise_mean = 0; \ - for (int y = 0; y < max_h; ++y) { \ - for (int x = 0; x < max_w; ++x) { \ - double noise = (double)data[(y_o + y) * stride + x_o + x] - \ - denoised[(y_o + y) * stride + x_o + x]; \ - noise_mean += noise; \ - noise_var += noise * noise; \ - } \ - } \ - noise_mean /= (max_w * max_h); \ - return noise_var / (max_w * max_h) - noise_mean * noise_mean; \ - } - -GET_NOISE_VAR(uint8_t, lowbd); -GET_NOISE_VAR(uint16_t, highbd); - -static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised, - int w, int h, int stride, int x_o, int y_o, - int block_size_x, int block_size_y, - int use_highbd) { - if (use_highbd) - return get_noise_var_highbd((const uint16_t *)data, - (const uint16_t *)denoised, w, h, stride, x_o, - y_o, block_size_x, block_size_y); - return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o, - block_size_x, block_size_y); -} - -static void equation_system_clear(aom_equation_system_t *eqns) { - const int n = eqns->n; - memset(eqns->A, 0, sizeof(*eqns->A) * n * n); - memset(eqns->x, 0, sizeof(*eqns->x) * n); - memset(eqns->b, 0, sizeof(*eqns->b) * n); -} - -static void equation_system_copy(aom_equation_system_t *dst, - const aom_equation_system_t *src) { - const int n = dst->n; - memcpy(dst->A, src->A, sizeof(*dst->A) * n * n); - memcpy(dst->x, src->x, sizeof(*dst->x) * n); - memcpy(dst->b, src->b, sizeof(*dst->b) * n); -} - -static int equation_system_init(aom_equation_system_t *eqns, int n) { - eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n); - eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n); - eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n); - 
eqns->n = n; - if (!eqns->A || !eqns->b || !eqns->x) { - fprintf(stderr, "Failed to allocate system of equations of size %d\n", n); - aom_free(eqns->A); - aom_free(eqns->b); - aom_free(eqns->x); - memset(eqns, 0, sizeof(*eqns)); - return 0; - } - equation_system_clear(eqns); - return 1; -} - -static int equation_system_solve(aom_equation_system_t *eqns) { - const int n = eqns->n; - double *b = (double *)aom_malloc(sizeof(*b) * n); - double *A = (double *)aom_malloc(sizeof(*A) * n * n); - int ret = 0; - if (A == NULL || b == NULL) { - fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n); - aom_free(b); - aom_free(A); - return 0; - } - memcpy(A, eqns->A, sizeof(*eqns->A) * n * n); - memcpy(b, eqns->b, sizeof(*eqns->b) * n); - ret = linsolve(n, A, eqns->n, b, eqns->x); - aom_free(b); - aom_free(A); - - if (ret == 0) { - return 0; - } - return 1; -} - -static void equation_system_add(aom_equation_system_t *dest, - aom_equation_system_t *src) { - const int n = dest->n; - int i, j; - for (i = 0; i < n; ++i) { - for (j = 0; j < n; ++j) { - dest->A[i * n + j] += src->A[i * n + j]; - } - dest->b[i] += src->b[i]; - } -} - -static void equation_system_free(aom_equation_system_t *eqns) { - if (!eqns) return; - aom_free(eqns->A); - aom_free(eqns->b); - aom_free(eqns->x); - memset(eqns, 0, sizeof(*eqns)); -} - -static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) { - equation_system_clear(&solver->eqns); - solver->num_equations = 0; - solver->total = 0; -} - -static void noise_strength_solver_add(aom_noise_strength_solver_t *dest, - aom_noise_strength_solver_t *src) { - equation_system_add(&dest->eqns, &src->eqns); - dest->num_equations += src->num_equations; - dest->total += src->total; -} - -// Return the number of coefficients required for the given parameters -static int num_coeffs(const aom_noise_model_params_t params) { - const int n = 2 * params.lag + 1; - switch (params.shape) { - case AOM_NOISE_SHAPE_DIAMOND: return params.lag 
* (params.lag + 1); - case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2; - } - return 0; -} - -static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) { - const int kNumBins = 20; - if (!equation_system_init(&state->eqns, n)) { - fprintf(stderr, "Failed initialization noise state with size %d\n", n); - return 0; - } - state->ar_gain = 1.0; - state->num_observations = 0; - return aom_noise_strength_solver_init(&state->strength_solver, kNumBins, - bit_depth); -} - -static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) { - const double kTolerance = 1e-6; - const int last = eqns->n - 1; - // Set all of the AR coefficients to zero, but try to solve for correlation - // with the luma channel - memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n); - if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) { - eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last]; - } -} - -int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) { - if (!lut) return 0; - lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); - if (!lut->points) return 0; - lut->num_points = num_points; - memset(lut->points, 0, sizeof(*lut->points) * num_points); - return 1; -} - -void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) { - if (!lut) return; - aom_free(lut->points); - memset(lut, 0, sizeof(*lut)); -} - -double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, - double x) { - int i = 0; - // Constant extrapolation for x < x_0. 
- if (x < lut->points[0][0]) return lut->points[0][1]; - for (i = 0; i < lut->num_points - 1; ++i) { - if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) { - const double a = - (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]); - return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a); - } - } - // Constant extrapolation for x > x_{n-1} - return lut->points[lut->num_points - 1][1]; -} - -static double noise_strength_solver_get_bin_index( - const aom_noise_strength_solver_t *solver, double value) { - const double val = - fclamp(value, solver->min_intensity, solver->max_intensity); - const double range = solver->max_intensity - solver->min_intensity; - return (solver->num_bins - 1) * (val - solver->min_intensity) / range; -} - -static double noise_strength_solver_get_value( - const aom_noise_strength_solver_t *solver, double x) { - const double bin = noise_strength_solver_get_bin_index(solver, x); - const int bin_i0 = (int)floor(bin); - const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); - const double a = bin - bin_i0; - return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1]; -} - -void aom_noise_strength_solver_add_measurement( - aom_noise_strength_solver_t *solver, double block_mean, double noise_std) { - const double bin = noise_strength_solver_get_bin_index(solver, block_mean); - const int bin_i0 = (int)floor(bin); - const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); - const double a = bin - bin_i0; - const int n = solver->num_bins; - solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a); - solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a); - solver->eqns.A[bin_i1 * n + bin_i1] += a * a; - solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a); - solver->eqns.b[bin_i0] += (1.0 - a) * noise_std; - solver->eqns.b[bin_i1] += a * noise_std; - solver->total += noise_std; - solver->num_equations++; -} - -int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) { - // Add 
regularization proportional to the number of constraints - const int n = solver->num_bins; - const double kAlpha = 2.0 * (double)(solver->num_equations) / n; - int result = 0; - double mean = 0; - - // Do this in a non-destructive manner so it is not confusing to the caller - double *old_A = solver->eqns.A; - double *A = (double *)aom_malloc(sizeof(*A) * n * n); - if (!A) { - fprintf(stderr, "Unable to allocate copy of A\n"); - return 0; - } - memcpy(A, old_A, sizeof(*A) * n * n); - - for (int i = 0; i < n; ++i) { - const int i_lo = AOMMAX(0, i - 1); - const int i_hi = AOMMIN(n - 1, i + 1); - A[i * n + i_lo] -= kAlpha; - A[i * n + i] += 2 * kAlpha; - A[i * n + i_hi] -= kAlpha; - } - - // Small regularization to give average noise strength - mean = solver->total / solver->num_equations; - for (int i = 0; i < n; ++i) { - A[i * n + i] += 1.0 / 8192.; - solver->eqns.b[i] += mean / 8192.; - } - solver->eqns.A = A; - result = equation_system_solve(&solver->eqns); - solver->eqns.A = old_A; - - aom_free(A); - return result; -} - -int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, - int num_bins, int bit_depth) { - if (!solver) return 0; - memset(solver, 0, sizeof(*solver)); - solver->num_bins = num_bins; - solver->min_intensity = 0; - solver->max_intensity = (1 << bit_depth) - 1; - solver->total = 0; - solver->num_equations = 0; - return equation_system_init(&solver->eqns, num_bins); -} - -void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) { - if (!solver) return; - equation_system_free(&solver->eqns); -} - -double aom_noise_strength_solver_get_center( - const aom_noise_strength_solver_t *solver, int i) { - const double range = solver->max_intensity - solver->min_intensity; - const int n = solver->num_bins; - return ((double)i) / (n - 1) * range + solver->min_intensity; -} - -// Computes the residual if a point were to be removed from the lut. 
This is -// calculated as the area between the output of the solver and the line segment -// that would be formed between [x_{i - 1}, x_{i + 1}). -static void update_piecewise_linear_residual( - const aom_noise_strength_solver_t *solver, - const aom_noise_strength_lut_t *lut, double *residual, int start, int end) { - const double dx = 255. / solver->num_bins; - for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) { - const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index( - solver, lut->points[i - 1][0]))); - const int upper = AOMMIN(solver->num_bins - 1, - (int)ceil(noise_strength_solver_get_bin_index( - solver, lut->points[i + 1][0]))); - double r = 0; - for (int j = lower; j <= upper; ++j) { - const double x = aom_noise_strength_solver_get_center(solver, j); - if (x < lut->points[i - 1][0]) continue; - if (x >= lut->points[i + 1][0]) continue; - const double y = solver->eqns.x[j]; - const double a = (x - lut->points[i - 1][0]) / - (lut->points[i + 1][0] - lut->points[i - 1][0]); - const double estimate_y = - lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a; - r += fabs(y - estimate_y); - } - residual[i] = r * dx; - } -} - -int aom_noise_strength_solver_fit_piecewise( - const aom_noise_strength_solver_t *solver, int max_output_points, - aom_noise_strength_lut_t *lut) { - // The tolerance is normalized to be give consistent results between - // different bit-depths. 
- const double kTolerance = solver->max_intensity * 0.00625 / 255.0; - if (!aom_noise_strength_lut_init(lut, solver->num_bins)) { - fprintf(stderr, "Failed to init lut\n"); - return 0; - } - for (int i = 0; i < solver->num_bins; ++i) { - lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i); - lut->points[i][1] = solver->eqns.x[i]; - } - if (max_output_points < 0) { - max_output_points = solver->num_bins; - } - - double *residual = aom_malloc(solver->num_bins * sizeof(*residual)); - memset(residual, 0, sizeof(*residual) * solver->num_bins); - - update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins); - - // Greedily remove points if there are too many or if it doesn't hurt local - // approximation (never remove the end points) - while (lut->num_points > 2) { - int min_index = 1; - for (int j = 1; j < lut->num_points - 1; ++j) { - if (residual[j] < residual[min_index]) { - min_index = j; - } - } - const double dx = - lut->points[min_index + 1][0] - lut->points[min_index - 1][0]; - const double avg_residual = residual[min_index] / dx; - if (lut->num_points <= max_output_points && avg_residual > kTolerance) { - break; - } - - const int num_remaining = lut->num_points - min_index - 1; - memmove(lut->points + min_index, lut->points + min_index + 1, - sizeof(lut->points[0]) * num_remaining); - lut->num_points--; - - update_piecewise_linear_residual(solver, lut, residual, min_index - 1, - min_index + 1); - } - aom_free(residual); - return 1; -} - -int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, - int block_size, int bit_depth, int use_highbd) { - const int n = block_size * block_size; - aom_equation_system_t eqns; - double *AtA_inv = 0; - double *A = 0; - int x = 0, y = 0, i = 0, j = 0; - if (!equation_system_init(&eqns, kLowPolyNumParams)) { - fprintf(stderr, "Failed to init equation system for block_size=%d\n", - block_size); - return 0; - } - - AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams 
* - sizeof(*AtA_inv)); - A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A)); - if (AtA_inv == NULL || A == NULL) { - fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n", - block_size); - aom_free(AtA_inv); - aom_free(A); - equation_system_free(&eqns); - return 0; - } - - block_finder->A = A; - block_finder->AtA_inv = AtA_inv; - block_finder->block_size = block_size; - block_finder->normalization = (1 << bit_depth) - 1; - block_finder->use_highbd = use_highbd; - - for (y = 0; y < block_size; ++y) { - const double yd = ((double)y - block_size / 2.) / (block_size / 2.); - for (x = 0; x < block_size; ++x) { - const double xd = ((double)x - block_size / 2.) / (block_size / 2.); - const double coords[3] = { yd, xd, 1 }; - const int row = y * block_size + x; - A[kLowPolyNumParams * row + 0] = yd; - A[kLowPolyNumParams * row + 1] = xd; - A[kLowPolyNumParams * row + 2] = 1; - - for (i = 0; i < kLowPolyNumParams; ++i) { - for (j = 0; j < kLowPolyNumParams; ++j) { - eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j]; - } - } - } - } - - // Lazy inverse using existing equation solver. 
- for (i = 0; i < kLowPolyNumParams; ++i) { - memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams); - eqns.b[i] = 1; - equation_system_solve(&eqns); - - for (j = 0; j < kLowPolyNumParams; ++j) { - AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j]; - } - } - equation_system_free(&eqns); - return 1; -} - -void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) { - if (!block_finder) return; - aom_free(block_finder->A); - aom_free(block_finder->AtA_inv); - memset(block_finder, 0, sizeof(*block_finder)); -} - -void aom_flat_block_finder_extract_block( - const aom_flat_block_finder_t *block_finder, const uint8_t *const data, - int w, int h, int stride, int offsx, int offsy, double *plane, - double *block) { - const int block_size = block_finder->block_size; - const int n = block_size * block_size; - const double *A = block_finder->A; - const double *AtA_inv = block_finder->AtA_inv; - double plane_coords[kLowPolyNumParams]; - double AtA_inv_b[kLowPolyNumParams]; - int xi, yi, i; - - if (block_finder->use_highbd) { - const uint16_t *const data16 = (const uint16_t *const)data; - for (yi = 0; yi < block_size; ++yi) { - const int y = clamp(offsy + yi, 0, h - 1); - for (xi = 0; xi < block_size; ++xi) { - const int x = clamp(offsx + xi, 0, w - 1); - block[yi * block_size + xi] = - ((double)data16[y * stride + x]) / block_finder->normalization; - } - } - } else { - for (yi = 0; yi < block_size; ++yi) { - const int y = clamp(offsy + yi, 0, h - 1); - for (xi = 0; xi < block_size; ++xi) { - const int x = clamp(offsx + xi, 0, w - 1); - block[yi * block_size + xi] = - ((double)data[y * stride + x]) / block_finder->normalization; - } - } - } - multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams); - multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams, - kLowPolyNumParams, 1); - multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1); - - for (i = 0; i < n; ++i) { - block[i] -= plane[i]; - } -} - -typedef struct { - int index; - float score; -} 
index_and_score_t; - -static int compare_scores(const void *a, const void *b) { - const float diff = - ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score; - if (diff < 0) - return -1; - else if (diff > 0) - return 1; - return 0; -} - -int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, - const uint8_t *const data, int w, int h, - int stride, uint8_t *flat_blocks) { - // The gradient-based features used in this code are based on: - // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise - // correlation for improved video denoising," 2012 19th, ICIP. - // The thresholds are more lenient to allow for correct grain modeling - // if extreme cases. - const int block_size = block_finder->block_size; - const int n = block_size * block_size; - const double kTraceThreshold = 0.15 / (32 * 32); - const double kRatioThreshold = 1.25; - const double kNormThreshold = 0.08 / (32 * 32); - const double kVarThreshold = 0.005 / (double)n; - const int num_blocks_w = (w + block_size - 1) / block_size; - const int num_blocks_h = (h + block_size - 1) / block_size; - int num_flat = 0; - int bx = 0, by = 0; - double *plane = (double *)aom_malloc(n * sizeof(*plane)); - double *block = (double *)aom_malloc(n * sizeof(*block)); - index_and_score_t *scores = (index_and_score_t *)aom_malloc( - num_blocks_w * num_blocks_h * sizeof(*scores)); - if (plane == NULL || block == NULL || scores == NULL) { - fprintf(stderr, "Failed to allocate memory for block of size %d\n", n); - aom_free(plane); - aom_free(block); - aom_free(scores); - return -1; - } - -#ifdef NOISE_MODEL_LOG_SCORE - fprintf(stderr, "score = ["); -#endif - for (by = 0; by < num_blocks_h; ++by) { - for (bx = 0; bx < num_blocks_w; ++bx) { - // Compute gradient covariance matrix. 
- double Gxx = 0, Gxy = 0, Gyy = 0; - double var = 0; - double mean = 0; - int xi, yi; - aom_flat_block_finder_extract_block(block_finder, data, w, h, stride, - bx * block_size, by * block_size, - plane, block); - - for (yi = 1; yi < block_size - 1; ++yi) { - for (xi = 1; xi < block_size - 1; ++xi) { - const double gx = (block[yi * block_size + xi + 1] - - block[yi * block_size + xi - 1]) / - 2; - const double gy = (block[yi * block_size + xi + block_size] - - block[yi * block_size + xi - block_size]) / - 2; - Gxx += gx * gx; - Gxy += gx * gy; - Gyy += gy * gy; - - mean += block[yi * block_size + xi]; - var += block[yi * block_size + xi] * block[yi * block_size + xi]; - } - } - mean /= (block_size - 2) * (block_size - 2); - - // Normalize gradients by block_size. - Gxx /= ((block_size - 2) * (block_size - 2)); - Gxy /= ((block_size - 2) * (block_size - 2)); - Gyy /= ((block_size - 2) * (block_size - 2)); - var = var / ((block_size - 2) * (block_size - 2)) - mean * mean; - - { - const double trace = Gxx + Gyy; - const double det = Gxx * Gyy - Gxy * Gxy; - const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.; - const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.; - const double norm = e1; // Spectral norm - const double ratio = (e1 / AOMMAX(e2, 1e-6)); - const int is_flat = (trace < kTraceThreshold) && - (ratio < kRatioThreshold) && - (norm < kNormThreshold) && (var > kVarThreshold); - // The following weights are used to combine the above features to give - // a sigmoid score for flatness. If the input was normalized to [0,100] - // the magnitude of these values would be close to 1 (e.g., weights - // corresponding to variance would be a factor of 10000x smaller). - // The weights are given in the following order: - // [{var}, {ratio}, {trace}, {norm}, offset] - // with one of the most discriminative being simply the variance. 
- const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 }; - const float score = - (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio + - weights[2] * trace + weights[3] * norm + - weights[4])))); - flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0; - scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0; - scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx; -#ifdef NOISE_MODEL_LOG_SCORE - fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm, - is_flat); -#endif - num_flat += is_flat; - } - } -#ifdef NOISE_MODEL_LOG_SCORE - fprintf(stderr, "\n"); -#endif - } -#ifdef NOISE_MODEL_LOG_SCORE - fprintf(stderr, "];\n"); -#endif - // Find the top-scored blocks (most likely to be flat) and set the flat blocks - // be the union of the thresholded results and the top 10th percentile of the - // scored results. - qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores); - const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100; - const float score_threshold = scores[top_nth_percentile].score; - for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) { - if (scores[i].score >= score_threshold) { - num_flat += flat_blocks[scores[i].index] == 0; - flat_blocks[scores[i].index] |= 1; - } - } - aom_free(block); - aom_free(plane); - aom_free(scores); - return num_flat; -} - -int aom_noise_model_init(aom_noise_model_t *model, - const aom_noise_model_params_t params) { - const int n = num_coeffs(params); - const int lag = params.lag; - const int bit_depth = params.bit_depth; - int x = 0, y = 0, i = 0, c = 0; - - memset(model, 0, sizeof(*model)); - if (params.lag < 1) { - fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag); - return 0; - } - if (params.lag > kMaxLag) { - fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag, - kMaxLag); - return 0; - } - - memcpy(&model->params, ¶ms, sizeof(params)); - for (c = 0; c < 3; ++c) { - if 
(!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) { - fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); - aom_noise_model_free(model); - return 0; - } - if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) { - fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); - aom_noise_model_free(model); - return 0; - } - } - model->n = n; - model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n); - - for (y = -lag; y <= 0; ++y) { - const int max_x = y == 0 ? -1 : lag; - for (x = -lag; x <= max_x; ++x) { - switch (params.shape) { - case AOM_NOISE_SHAPE_DIAMOND: - if (abs(x) <= y + lag) { - model->coords[i][0] = x; - model->coords[i][1] = y; - ++i; - } - break; - case AOM_NOISE_SHAPE_SQUARE: - model->coords[i][0] = x; - model->coords[i][1] = y; - ++i; - break; - default: - fprintf(stderr, "Invalid shape\n"); - aom_noise_model_free(model); - return 0; - } - } - } - assert(i == n); - return 1; -} - -void aom_noise_model_free(aom_noise_model_t *model) { - int c = 0; - if (!model) return; - - aom_free(model->coords); - for (c = 0; c < 3; ++c) { - equation_system_free(&model->latest_state[c].eqns); - equation_system_free(&model->combined_state[c].eqns); - - equation_system_free(&model->latest_state[c].strength_solver.eqns); - equation_system_free(&model->combined_state[c].strength_solver.eqns); - } - memset(model, 0, sizeof(*model)); -} - -// Extracts the neighborhood defined by coords around point (x, y) from -// the difference between the data and denoised images. Also extracts the -// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma). 
-#define EXTRACT_AR_ROW(INT_TYPE, suffix) \ - static double extract_ar_row_##suffix( \ - int(*coords)[2], int num_coords, const INT_TYPE *const data, \ - const INT_TYPE *const denoised, int stride, int sub_log2[2], \ - const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \ - int alt_stride, int x, int y, double *buffer) { \ - for (int i = 0; i < num_coords; ++i) { \ - const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \ - buffer[i] = \ - (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \ - } \ - const double val = \ - (double)data[y * stride + x] - denoised[y * stride + x]; \ - \ - if (alt_data && alt_denoised) { \ - double avg_data = 0, avg_denoised = 0; \ - int num_samples = 0; \ - for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \ - const int y_up = (y << sub_log2[1]) + dy_i; \ - for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \ - const int x_up = (x << sub_log2[0]) + dx_i; \ - avg_data += alt_data[y_up * alt_stride + x_up]; \ - avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \ - num_samples++; \ - } \ - } \ - buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \ - } \ - return val; \ - } - -EXTRACT_AR_ROW(uint8_t, lowbd); -EXTRACT_AR_ROW(uint16_t, highbd); - -static int add_block_observations( - aom_noise_model_t *noise_model, int c, const uint8_t *const data, - const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2], - const uint8_t *const alt_data, const uint8_t *const alt_denoised, - int alt_stride, const uint8_t *const flat_blocks, int block_size, - int num_blocks_w, int num_blocks_h) { - const int lag = noise_model->params.lag; - const int num_coords = noise_model->n; - const double normalization = (1 << noise_model->params.bit_depth) - 1; - double *A = noise_model->latest_state[c].eqns.A; - double *b = noise_model->latest_state[c].eqns.b; - double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1)); - const int n = noise_model->latest_state[c].eqns.n; - 
- if (!buffer) { - fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1); - return 0; - } - for (int by = 0; by < num_blocks_h; ++by) { - const int y_o = by * (block_size >> sub_log2[1]); - for (int bx = 0; bx < num_blocks_w; ++bx) { - const int x_o = bx * (block_size >> sub_log2[0]); - if (!flat_blocks[by * num_blocks_w + bx]) { - continue; - } - int y_start = - (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag; - int x_start = - (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag; - int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), - block_size >> sub_log2[1]); - int x_end = AOMMIN( - (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag, - (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1]) - ? (block_size >> sub_log2[0]) - : ((block_size >> sub_log2[0]) - lag)); - for (int y = y_start; y < y_end; ++y) { - for (int x = x_start; x < x_end; ++x) { - const double val = - noise_model->params.use_highbd - ? 
extract_ar_row_highbd(noise_model->coords, num_coords, - (const uint16_t *const)data, - (const uint16_t *const)denoised, - stride, sub_log2, - (const uint16_t *const)alt_data, - (const uint16_t *const)alt_denoised, - alt_stride, x + x_o, y + y_o, buffer) - : extract_ar_row_lowbd(noise_model->coords, num_coords, data, - denoised, stride, sub_log2, alt_data, - alt_denoised, alt_stride, x + x_o, - y + y_o, buffer); - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - A[i * n + j] += - (buffer[i] * buffer[j]) / (normalization * normalization); - } - b[i] += (buffer[i] * val) / (normalization * normalization); - } - noise_model->latest_state[c].num_observations++; - } - } - } - } - aom_free(buffer); - return 1; -} - -static void add_noise_std_observations( - aom_noise_model_t *noise_model, int c, const double *coeffs, - const uint8_t *const data, const uint8_t *const denoised, int w, int h, - int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride, - const uint8_t *const flat_blocks, int block_size, int num_blocks_w, - int num_blocks_h) { - const int num_coords = noise_model->n; - aom_noise_strength_solver_t *noise_strength_solver = - &noise_model->latest_state[c].strength_solver; - - const aom_noise_strength_solver_t *noise_strength_luma = - &noise_model->latest_state[0].strength_solver; - const double luma_gain = noise_model->latest_state[0].ar_gain; - const double noise_gain = noise_model->latest_state[c].ar_gain; - for (int by = 0; by < num_blocks_h; ++by) { - const int y_o = by * (block_size >> sub_log2[1]); - for (int bx = 0; bx < num_blocks_w; ++bx) { - const int x_o = bx * (block_size >> sub_log2[0]); - if (!flat_blocks[by * num_blocks_w + bx]) { - continue; - } - const int num_samples_h = - AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), - block_size >> sub_log2[1]); - const int num_samples_w = - AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]), - (block_size >> sub_log2[0])); - // Make sure that we have 
a reasonable amount of samples to consider the - // block - if (num_samples_w * num_samples_h > block_size) { - const double block_mean = get_block_mean( - alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride, - x_o << sub_log2[0], y_o << sub_log2[1], block_size, - noise_model->params.use_highbd); - const double noise_var = get_noise_var( - data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o, - y_o, block_size >> sub_log2[0], block_size >> sub_log2[1], - noise_model->params.use_highbd); - // We want to remove the part of the noise that came from being - // correlated with luma. Note that the noise solver for luma must - // have already been run. - const double luma_strength = - c > 0 ? luma_gain * noise_strength_solver_get_value( - noise_strength_luma, block_mean) - : 0; - const double corr = c > 0 ? coeffs[num_coords] : 0; - // Chroma noise: - // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2) - // The uncorrelated component: - // uncorr_var = noise_var - (corr * luma_strength)^2 - // But don't allow fully correlated noise (hence the max), since the - // synthesis cannot model it. - const double uncorr_std = sqrt( - AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2))); - // After we've removed correlation with luma, undo the gain that will - // come from running the IIR filter. - const double adjusted_strength = uncorr_std / noise_gain; - aom_noise_strength_solver_add_measurement( - noise_strength_solver, block_mean, adjusted_strength); - } - } - } -} - -// Return true if the noise estimate appears to be different from the combined -// (multi-frame) estimate. The difference is measured by checking whether the -// AR coefficients have diverged (using a threshold on normalized cross -// correlation), or whether the noise strength has changed. 
-static int is_noise_model_different(aom_noise_model_t *const noise_model) { - // These thresholds are kind of arbitrary and will likely need further tuning - // (or exported as parameters). The threshold on noise strength is a weighted - // difference between the noise strength histograms - const double kCoeffThreshold = 0.9; - const double kStrengthThreshold = - 0.005 * (1 << (noise_model->params.bit_depth - 8)); - for (int c = 0; c < 1; ++c) { - const double corr = - aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x, - noise_model->combined_state[c].eqns.x, - noise_model->combined_state[c].eqns.n); - if (corr < kCoeffThreshold) return 1; - - const double dx = - 1.0 / noise_model->latest_state[c].strength_solver.num_bins; - - const aom_equation_system_t *latest_eqns = - &noise_model->latest_state[c].strength_solver.eqns; - const aom_equation_system_t *combined_eqns = - &noise_model->combined_state[c].strength_solver.eqns; - double diff = 0; - double total_weight = 0; - for (int j = 0; j < latest_eqns->n; ++j) { - double weight = 0; - for (int i = 0; i < latest_eqns->n; ++i) { - weight += latest_eqns->A[i * latest_eqns->n + j]; - } - weight = sqrt(weight); - diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]); - total_weight += weight; - } - if (diff * dx / total_weight > kStrengthThreshold) return 1; - } - return 0; -} - -static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) { - const int ret = equation_system_solve(&state->eqns); - state->ar_gain = 1.0; - if (!ret) return ret; - - // Update the AR gain from the equation system as it will be used to fit - // the noise strength as a function of intensity. In the Yule-Walker - // equations, the diagonal should be the variance of the correlated noise. - // In the case of the least squares estimate, there will be some variability - // in the diagonal. 
So use the mean of the diagonal as the estimate of - // overall variance (this works for least squares or Yule-Walker formulation). - double var = 0; - const int n = state->eqns.n; - for (int i = 0; i < (state->eqns.n - is_chroma); ++i) { - var += state->eqns.A[i * n + i] / state->num_observations; - } - var /= (n - is_chroma); - - // Keep track of E(Y^2) = <b, x> + E(X^2) - // In the case that we are using chroma and have an estimate of correlation - // with luma we adjust that estimate slightly to remove the correlated bits by - // subtracting out the last column of a scaled by our correlation estimate - // from b. E(y^2) = <b - A(:, end)*x(end), x> - double sum_covar = 0; - for (int i = 0; i < state->eqns.n - is_chroma; ++i) { - double bi = state->eqns.b[i]; - if (is_chroma) { - bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1]; - } - sum_covar += (bi * state->eqns.x[i]) / state->num_observations; - } - // Now, get an estimate of the variance of uncorrelated noise signal and use - // it to determine the gain of the AR filter. 
- const double noise_var = AOMMAX(var - sum_covar, 1e-6); - state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6))); - return ret; -} - -aom_noise_status_t aom_noise_model_update( - aom_noise_model_t *const noise_model, const uint8_t *const data[3], - const uint8_t *const denoised[3], int w, int h, int stride[3], - int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) { - const int num_blocks_w = (w + block_size - 1) / block_size; - const int num_blocks_h = (h + block_size - 1) / block_size; - int y_model_different = 0; - int num_blocks = 0; - int i = 0, channel = 0; - - if (block_size <= 1) { - fprintf(stderr, "block_size = %d must be > 1\n", block_size); - return AOM_NOISE_STATUS_INVALID_ARGUMENT; - } - - if (block_size < noise_model->params.lag * 2 + 1) { - fprintf(stderr, "block_size = %d must be >= %d\n", block_size, - noise_model->params.lag * 2 + 1); - return AOM_NOISE_STATUS_INVALID_ARGUMENT; - } - - // Clear the latest equation system - for (i = 0; i < 3; ++i) { - equation_system_clear(&noise_model->latest_state[i].eqns); - noise_model->latest_state[i].num_observations = 0; - noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver); - } - - // Check that we have enough flat blocks - for (i = 0; i < num_blocks_h * num_blocks_w; ++i) { - if (flat_blocks[i]) { - num_blocks++; - } - } - - if (num_blocks <= 1) { - fprintf(stderr, "Not enough flat blocks to update noise estimate\n"); - return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS; - } - - for (channel = 0; channel < 3; ++channel) { - int no_subsampling[2] = { 0, 0 }; - const uint8_t *alt_data = channel > 0 ? data[0] : 0; - const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0; - int *sub = channel > 0 ? 
chroma_sub_log2 : no_subsampling; - const int is_chroma = channel != 0; - if (!data[channel] || !denoised[channel]) break; - if (!add_block_observations(noise_model, channel, data[channel], - denoised[channel], w, h, stride[channel], sub, - alt_data, alt_denoised, stride[0], flat_blocks, - block_size, num_blocks_w, num_blocks_h)) { - fprintf(stderr, "Adding block observation failed\n"); - return AOM_NOISE_STATUS_INTERNAL_ERROR; - } - - if (!ar_equation_system_solve(&noise_model->latest_state[channel], - is_chroma)) { - if (is_chroma) { - set_chroma_coefficient_fallback_soln( - &noise_model->latest_state[channel].eqns); - } else { - fprintf(stderr, "Solving latest noise equation system failed %d!\n", - channel); - return AOM_NOISE_STATUS_INTERNAL_ERROR; - } - } - - add_noise_std_observations( - noise_model, channel, noise_model->latest_state[channel].eqns.x, - data[channel], denoised[channel], w, h, stride[channel], sub, alt_data, - stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h); - - if (!aom_noise_strength_solver_solve( - &noise_model->latest_state[channel].strength_solver)) { - fprintf(stderr, "Solving latest noise strength failed!\n"); - return AOM_NOISE_STATUS_INTERNAL_ERROR; - } - - // Check noise characteristics and return if error. - if (channel == 0 && - noise_model->combined_state[channel].strength_solver.num_equations > - 0 && - is_noise_model_different(noise_model)) { - y_model_different = 1; - } - - // Don't update the combined stats if the y model is different. 
- if (y_model_different) continue; - - noise_model->combined_state[channel].num_observations += - noise_model->latest_state[channel].num_observations; - equation_system_add(&noise_model->combined_state[channel].eqns, - &noise_model->latest_state[channel].eqns); - if (!ar_equation_system_solve(&noise_model->combined_state[channel], - is_chroma)) { - if (is_chroma) { - set_chroma_coefficient_fallback_soln( - &noise_model->combined_state[channel].eqns); - } else { - fprintf(stderr, "Solving combined noise equation system failed %d!\n", - channel); - return AOM_NOISE_STATUS_INTERNAL_ERROR; - } - } - - noise_strength_solver_add( - &noise_model->combined_state[channel].strength_solver, - &noise_model->latest_state[channel].strength_solver); - - if (!aom_noise_strength_solver_solve( - &noise_model->combined_state[channel].strength_solver)) { - fprintf(stderr, "Solving combined noise strength failed!\n"); - return AOM_NOISE_STATUS_INTERNAL_ERROR; - } - } - - return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE - : AOM_NOISE_STATUS_OK; -} - -void aom_noise_model_save_latest(aom_noise_model_t *noise_model) { - for (int c = 0; c < 3; c++) { - equation_system_copy(&noise_model->combined_state[c].eqns, - &noise_model->latest_state[c].eqns); - equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns, - &noise_model->latest_state[c].strength_solver.eqns); - noise_model->combined_state[c].strength_solver.num_equations = - noise_model->latest_state[c].strength_solver.num_equations; - noise_model->combined_state[c].num_observations = - noise_model->latest_state[c].num_observations; - noise_model->combined_state[c].ar_gain = - noise_model->latest_state[c].ar_gain; - } -} - -int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, - aom_film_grain_t *film_grain) { - if (noise_model->params.lag > 3) { - fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); - return 0; - } - uint16_t random_seed = film_grain->random_seed; - 
memset(film_grain, 0, sizeof(*film_grain)); - film_grain->random_seed = random_seed; - - film_grain->apply_grain = 1; - film_grain->update_parameters = 1; - - film_grain->ar_coeff_lag = noise_model->params.lag; - - // Convert the scaling functions to 8 bit values - aom_noise_strength_lut_t scaling_points[3]; - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0); - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1); - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2); - - // Both the domain and the range of the scaling functions in the film_grain - // are normalized to 8-bit (e.g., they are implicitly scaled during grain - // synthesis). - const double strength_divisor = 1 << (noise_model->params.bit_depth - 8); - double max_scaling_value = 1e-4; - for (int c = 0; c < 3; ++c) { - for (int i = 0; i < scaling_points[c].num_points; ++i) { - scaling_points[c].points[i][0] = - AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor); - scaling_points[c].points[i][1] = - AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor); - max_scaling_value = - AOMMAX(scaling_points[c].points[i][1], max_scaling_value); - } - } - - // Scaling_shift values are in the range [8,11] - const int max_scaling_value_log2 = - clamp((int)floor(log2(max_scaling_value) + 1), 2, 5); - film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2); - - const double scale_factor = 1 << (8 - max_scaling_value_log2); - film_grain->num_y_points = scaling_points[0].num_points; - film_grain->num_cb_points = scaling_points[1].num_points; - film_grain->num_cr_points = scaling_points[2].num_points; - - int(*film_grain_scaling[3])[2] = { - film_grain->scaling_points_y, - film_grain->scaling_points_cb, - film_grain->scaling_points_cr, - }; - for (int c = 0; c < 3; c++) { - for (int i = 0; i < 
scaling_points[c].num_points; ++i) { - film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5); - film_grain_scaling[c][i][1] = clamp( - (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255); - } - } - aom_noise_strength_lut_free(scaling_points + 0); - aom_noise_strength_lut_free(scaling_points + 1); - aom_noise_strength_lut_free(scaling_points + 2); - - // Convert the ar_coeffs into 8-bit values - const int n_coeff = noise_model->combined_state[0].eqns.n; - double max_coeff = 1e-4, min_coeff = -1e-4; - double y_corr[2] = { 0, 0 }; - double avg_luma_strength = 0; - for (int c = 0; c < 3; c++) { - aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; - for (int i = 0; i < n_coeff; ++i) { - max_coeff = AOMMAX(max_coeff, eqns->x[i]); - min_coeff = AOMMIN(min_coeff, eqns->x[i]); - } - // Since the correlation between luma/chroma was computed in an already - // scaled space, we adjust it in the un-scaled space. - aom_noise_strength_solver_t *solver = - &noise_model->combined_state[c].strength_solver; - // Compute a weighted average of the strength for the channel. 
- double average_strength = 0, total_weight = 0; - for (int i = 0; i < solver->eqns.n; ++i) { - double w = 0; - for (int j = 0; j < solver->eqns.n; ++j) { - w += solver->eqns.A[i * solver->eqns.n + j]; - } - w = sqrt(w); - average_strength += solver->eqns.x[i] * w; - total_weight += w; - } - if (total_weight == 0) - average_strength = 1; - else - average_strength /= total_weight; - if (c == 0) { - avg_luma_strength = average_strength; - } else { - y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength; - max_coeff = AOMMAX(max_coeff, y_corr[c - 1]); - min_coeff = AOMMIN(min_coeff, y_corr[c - 1]); - } - } - // Shift value: AR coeffs range (values 6-9) - // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25) - film_grain->ar_coeff_shift = - clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))), - 6, 9); - double scale_ar_coeff = 1 << film_grain->ar_coeff_shift; - int *ar_coeffs[3] = { - film_grain->ar_coeffs_y, - film_grain->ar_coeffs_cb, - film_grain->ar_coeffs_cr, - }; - for (int c = 0; c < 3; ++c) { - aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; - for (int i = 0; i < n_coeff; ++i) { - ar_coeffs[c][i] = - clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127); - } - if (c > 0) { - ar_coeffs[c][n_coeff] = - clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127); - } - } - - // At the moment, the noise modeling code assumes that the chroma scaling - // functions are a function of luma. 
- film_grain->cb_mult = 128; // 8 bits - film_grain->cb_luma_mult = 192; // 8 bits - film_grain->cb_offset = 256; // 9 bits - - film_grain->cr_mult = 128; // 8 bits - film_grain->cr_luma_mult = 192; // 8 bits - film_grain->cr_offset = 256; // 9 bits - - film_grain->chroma_scaling_from_luma = 0; - film_grain->grain_scale_shift = 0; - film_grain->overlap_flag = 1; - return 1; -} - -static void pointwise_multiply(const float *a, float *b, int n) { - for (int i = 0; i < n; ++i) { - b[i] *= a[i]; - } -} - -static float *get_half_cos_window(int block_size) { - float *window_function = - (float *)aom_malloc(block_size * block_size * sizeof(*window_function)); - for (int y = 0; y < block_size; ++y) { - const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2); - for (int x = 0; x < block_size; ++x) { - const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2); - window_function[y * block_size + x] = (float)(cos_yd * cos_xd); - } - } - return window_function; -} - -#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \ - static void dither_and_quantize_##suffix( \ - float *result, int result_stride, INT_TYPE *denoised, int w, int h, \ - int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \ - float block_normalization) { \ - for (int y = 0; y < (h >> chroma_sub_h); ++y) { \ - for (int x = 0; x < (w >> chroma_sub_w); ++x) { \ - const int result_idx = \ - (y + (block_size >> chroma_sub_h)) * result_stride + x + \ - (block_size >> chroma_sub_w); \ - INT_TYPE new_val = (INT_TYPE)AOMMIN( \ - AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \ - block_normalization); \ - const float err = \ - -(((float)new_val) / block_normalization - result[result_idx]); \ - denoised[y * stride + x] = new_val; \ - if (x + 1 < (w >> chroma_sub_w)) { \ - result[result_idx + 1] += err * 7.0f / 16.0f; \ - } \ - if (y + 1 < (h >> chroma_sub_h)) { \ - if (x > 0) { \ - result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \ - } \ - result[result_idx + result_stride] += 
err * 5.0f / 16.0f; \ - if (x + 1 < (w >> chroma_sub_w)) { \ - result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \ - } \ - } \ - } \ - } \ - } - -DITHER_AND_QUANTIZE(uint8_t, lowbd); -DITHER_AND_QUANTIZE(uint16_t, highbd); - -int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], - int w, int h, int stride[3], int chroma_sub[2], - float *noise_psd[3], int block_size, int bit_depth, - int use_highbd) { - float *plane = NULL, *block = NULL, *window_full = NULL, - *window_chroma = NULL; - double *block_d = NULL, *plane_d = NULL; - struct aom_noise_tx_t *tx_full = NULL; - struct aom_noise_tx_t *tx_chroma = NULL; - const int num_blocks_w = (w + block_size - 1) / block_size; - const int num_blocks_h = (h + block_size - 1) / block_size; - const int result_stride = (num_blocks_w + 2) * block_size; - const int result_height = (num_blocks_h + 2) * block_size; - float *result = NULL; - int init_success = 1; - aom_flat_block_finder_t block_finder_full; - aom_flat_block_finder_t block_finder_chroma; - const float kBlockNormalization = (float)((1 << bit_depth) - 1); - if (chroma_sub[0] != chroma_sub[1]) { - fprintf(stderr, - "aom_wiener_denoise_2d doesn't handle different chroma " - "subsampling"); - return 0; - } - init_success &= aom_flat_block_finder_init(&block_finder_full, block_size, - bit_depth, use_highbd); - result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride * - sizeof(*result)); - plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane)); - block = - (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block)); - block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d)); - plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d)); - window_full = get_half_cos_window(block_size); - tx_full = aom_noise_tx_malloc(block_size); - - if (chroma_sub[0] != 0) { - init_success &= aom_flat_block_finder_init(&block_finder_chroma, - block_size >> chroma_sub[0], - 
bit_depth, use_highbd); - window_chroma = get_half_cos_window(block_size >> chroma_sub[0]); - tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]); - } else { - window_chroma = window_full; - tx_chroma = tx_full; - } - - init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) && - (plane_d != NULL) && (block != NULL) && (block_d != NULL) && - (window_full != NULL) && (window_chroma != NULL) && - (result != NULL); - for (int c = init_success ? 0 : 3; c < 3; ++c) { - float *window_function = c == 0 ? window_full : window_chroma; - aom_flat_block_finder_t *block_finder = &block_finder_full; - const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0; - const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0; - struct aom_noise_tx_t *tx = - (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full; - if (!data[c] || !denoised[c]) continue; - if (c > 0 && chroma_sub[0] != 0) { - block_finder = &block_finder_chroma; - } - memset(result, 0, sizeof(*result) * result_stride * result_height); - // Do overlapped block processing (half overlapped). The block rows can - // easily be done in parallel - for (int offsy = 0; offsy < (block_size >> chroma_sub_h); - offsy += (block_size >> chroma_sub_h) / 2) { - for (int offsx = 0; offsx < (block_size >> chroma_sub_w); - offsx += (block_size >> chroma_sub_w) / 2) { - // Pad the boundary when processing each block-set. 
- for (int by = -1; by < num_blocks_h; ++by) { - for (int bx = -1; bx < num_blocks_w; ++bx) { - const int pixels_per_block = - (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h); - aom_flat_block_finder_extract_block( - block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h, - stride[c], bx * (block_size >> chroma_sub_w) + offsx, - by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d); - for (int j = 0; j < pixels_per_block; ++j) { - block[j] = (float)block_d[j]; - plane[j] = (float)plane_d[j]; - } - pointwise_multiply(window_function, block, pixels_per_block); - aom_noise_tx_forward(tx, block); - aom_noise_tx_filter(tx, noise_psd[c]); - aom_noise_tx_inverse(tx, block); - - // Apply window function to the plane approximation (we will apply - // it to the sum of plane + block when composing the results). - pointwise_multiply(window_function, plane, pixels_per_block); - - for (int y = 0; y < (block_size >> chroma_sub_h); ++y) { - const int y_result = - y + (by + 1) * (block_size >> chroma_sub_h) + offsy; - for (int x = 0; x < (block_size >> chroma_sub_w); ++x) { - const int x_result = - x + (bx + 1) * (block_size >> chroma_sub_w) + offsx; - result[y_result * result_stride + x_result] += - (block[y * (block_size >> chroma_sub_w) + x] + - plane[y * (block_size >> chroma_sub_w) + x]) * - window_function[y * (block_size >> chroma_sub_w) + x]; - } - } - } - } - } - } - if (use_highbd) { - dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c], - w, h, stride[c], chroma_sub_w, chroma_sub_h, - block_size, kBlockNormalization); - } else { - dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h, - stride[c], chroma_sub_w, chroma_sub_h, - block_size, kBlockNormalization); - } - } - aom_free(result); - aom_free(plane); - aom_free(block); - aom_free(plane_d); - aom_free(block_d); - aom_free(window_full); - - aom_noise_tx_free(tx_full); - - aom_flat_block_finder_free(&block_finder_full); - if (chroma_sub[0] != 0) { - 
aom_flat_block_finder_free(&block_finder_chroma); - aom_free(window_chroma); - aom_noise_tx_free(tx_chroma); - } - return init_success; -} - -struct aom_denoise_and_model_t { - int block_size; - int bit_depth; - float noise_level; - - // Size of current denoised buffer and flat_block buffer - int width; - int height; - int y_stride; - int uv_stride; - int num_blocks_w; - int num_blocks_h; - - // Buffers for image and noise_psd allocated on the fly - float *noise_psd[3]; - uint8_t *denoised[3]; - uint8_t *flat_blocks; - - aom_flat_block_finder_t flat_block_finder; - aom_noise_model_t noise_model; -}; - -struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, - int block_size, - float noise_level) { - struct aom_denoise_and_model_t *ctx = - (struct aom_denoise_and_model_t *)aom_malloc( - sizeof(struct aom_denoise_and_model_t)); - if (!ctx) { - fprintf(stderr, "Unable to allocate denoise_and_model struct\n"); - return NULL; - } - memset(ctx, 0, sizeof(*ctx)); - - ctx->block_size = block_size; - ctx->noise_level = noise_level; - ctx->bit_depth = bit_depth; - - ctx->noise_psd[0] = - aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size); - ctx->noise_psd[1] = - aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size); - ctx->noise_psd[2] = - aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size); - if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) { - fprintf(stderr, "Unable to allocate noise PSD buffers\n"); - aom_denoise_and_model_free(ctx); - return NULL; - } - return ctx; -} - -void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) { - aom_free(ctx->flat_blocks); - for (int i = 0; i < 3; ++i) { - aom_free(ctx->denoised[i]); - aom_free(ctx->noise_psd[i]); - } - aom_noise_model_free(&ctx->noise_model); - aom_flat_block_finder_free(&ctx->flat_block_finder); - aom_free(ctx); -} - -static int denoise_and_model_realloc_if_necessary( - struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) 
{ - if (ctx->width == sd->y_width && ctx->height == sd->y_height && - ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) - return 1; - const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; - const int block_size = ctx->block_size; - - ctx->width = sd->y_width; - ctx->height = sd->y_height; - ctx->y_stride = sd->y_stride; - ctx->uv_stride = sd->uv_stride; - - for (int i = 0; i < 3; ++i) { - aom_free(ctx->denoised[i]); - ctx->denoised[i] = NULL; - } - aom_free(ctx->flat_blocks); - ctx->flat_blocks = NULL; - - ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd); - ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); - ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); - if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) { - fprintf(stderr, "Unable to allocate denoise buffers\n"); - return 0; - } - ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size; - ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size; - ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h); - - aom_flat_block_finder_free(&ctx->flat_block_finder); - if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size, - ctx->bit_depth, use_highbd)) { - fprintf(stderr, "Unable to init flat block finder\n"); - return 0; - } - - const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, - ctx->bit_depth, use_highbd }; - aom_noise_model_free(&ctx->noise_model); - if (!aom_noise_model_init(&ctx->noise_model, params)) { - fprintf(stderr, "Unable to init noise model\n"); - return 0; - } - - // Simply use a flat PSD (although we could use the flat blocks to estimate - // PSD) those to estimate an actual noise PSD) - const float y_noise_level = - aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level); - const float uv_noise_level = aom_noise_psd_get_default_value( - ctx->block_size >> sd->subsampling_x, 
ctx->noise_level); - for (int i = 0; i < block_size * block_size; ++i) { - ctx->noise_psd[0][i] = y_noise_level; - ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level; - } - return 1; -} - -int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *sd, - aom_film_grain_t *film_grain) { - const int block_size = ctx->block_size; - const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; - uint8_t *raw_data[3] = { - use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer, - use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer, - use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer, - }; - const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] }; - int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride }; - int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y }; - - if (!denoise_and_model_realloc_if_necessary(ctx, sd)) { - fprintf(stderr, "Unable to realloc buffers\n"); - return 0; - } - - aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width, - sd->y_height, strides[0], ctx->flat_blocks); - - if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height, - strides, chroma_sub_log2, ctx->noise_psd, - block_size, ctx->bit_depth, use_highbd)) { - fprintf(stderr, "Unable to denoise image\n"); - return 0; - } - - const aom_noise_status_t status = aom_noise_model_update( - &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised, - sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks, - block_size); - int have_noise_estimate = 0; - if (status == AOM_NOISE_STATUS_OK) { - have_noise_estimate = 1; - } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { - aom_noise_model_save_latest(&ctx->noise_model); - have_noise_estimate = 1; - } else { - // Unable to update noise model; proceed if we have a previous estimate. 
- have_noise_estimate = - (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0); - } - - film_grain->apply_grain = 0; - if (have_noise_estimate) { - if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) { - fprintf(stderr, "Unable to get grain parameters.\n"); - return 0; - } - if (!film_grain->random_seed) { - film_grain->random_seed = 7391; - } - memcpy(raw_data[0], ctx->denoised[0], - (strides[0] * sd->y_height) << use_highbd); - memcpy(raw_data[1], ctx->denoised[1], - (strides[1] * sd->uv_height) << use_highbd); - memcpy(raw_data[2], ctx->denoised[2], - (strides[2] * sd->uv_height) << use_highbd); - } - return 1; -} diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h deleted file mode 100644 index 049d5be15..000000000 --- a/third_party/aom/aom_dsp/noise_model.h +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_NOISE_MODEL_H_ -#define AOM_AOM_DSP_NOISE_MODEL_H_ - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -#include <stdint.h> -#include "aom_dsp/grain_synthesis.h" -#include "aom_scale/yv12config.h" - -/*!\brief Wrapper of data required to represent linear system of eqns and soln. - */ -typedef struct { - double *A; - double *b; - double *x; - int n; -} aom_equation_system_t; - -/*!\brief Representation of a piecewise linear curve - * - * Holds n points as (x, y) pairs, that store the curve. 
- */ -typedef struct { - double (*points)[2]; - int num_points; -} aom_noise_strength_lut_t; - -/*!\brief Init the noise strength lut with the given number of points*/ -int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points); - -/*!\brief Frees the noise strength lut. */ -void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut); - -/*!\brief Evaluate the lut at the point x. - * - * \param[in] lut The lut data. - * \param[in] x The coordinate to evaluate the lut. - */ -double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, - double x); - -/*!\brief Helper struct to model noise strength as a function of intensity. - * - * Internally, this structure holds a representation of a linear system - * of equations that models noise strength (standard deviation) as a - * function of intensity. The mapping is initially stored using a - * piecewise representation with evenly spaced bins that cover the entire - * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a - * constraint of the form: - * y_{i} (1 - a) + y_{i+1} a = y - * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and - * a = x/(x_{i+1} - x{i}). The equation system holds the corresponding - * normal equations. - * - * As there may be missing data, the solution is regularized to get a - * complete set of values for the bins. A reduced representation after - * solving can be obtained by getting the corresponding noise_strength_lut_t. - */ -typedef struct { - aom_equation_system_t eqns; - double min_intensity; - double max_intensity; - int num_bins; - int num_equations; - double total; -} aom_noise_strength_solver_t; - -/*!\brief Initializes the noise solver with the given number of bins. - * - * Returns 0 if initialization fails. - * - * \param[in] solver The noise solver to be initialized. - * \param[in] num_bins Number of bins to use in the internal representation. 
- * \param[in] bit_depth The bit depth used to derive {min,max}_intensity. - */ -int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, - int num_bins, int bit_depth); -void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver); - -/*!\brief Gets the x coordinate of bin i. - * - * \param[in] i The bin whose coordinate to query. - */ -double aom_noise_strength_solver_get_center( - const aom_noise_strength_solver_t *solver, int i); - -/*!\brief Add an observation of the block mean intensity to its noise strength. - * - * \param[in] block_mean The average block intensity, - * \param[in] noise_std The observed noise strength. - */ -void aom_noise_strength_solver_add_measurement( - aom_noise_strength_solver_t *solver, double block_mean, double noise_std); - -/*!\brief Solves the current set of equations for the noise strength. */ -int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver); - -/*!\brief Fits a reduced piecewise linear lut to the internal solution - * - * \param[in] max_num_points The maximum number of output points - * \param[out] lut The output piecewise linear lut. - */ -int aom_noise_strength_solver_fit_piecewise( - const aom_noise_strength_solver_t *solver, int max_num_points, - aom_noise_strength_lut_t *lut); - -/*!\brief Helper for holding precomputed data for finding flat blocks. - * - * Internally a block is modeled with a low-order polynomial model. A - * planar model would be a bunch of equations like: - * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i - * for each point in the block. The system matrix A with row i as [y_i x_i 1] - * is maintained as is the inverse, inv(A'*A), so that the plane parameters - * can be fit for each block. 
- */ -typedef struct { - double *AtA_inv; - double *A; - int num_params; // The number of parameters used for internal low-order model - int block_size; // The block size the finder was initialized with - double normalization; // Normalization factor (1 / (2^(bit_depth) - 1)) - int use_highbd; // Whether input data should be interpreted as uint16 -} aom_flat_block_finder_t; - -/*!\brief Init the block_finder with the given block size, bit_depth */ -int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, - int block_size, int bit_depth, int use_highbd); -void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder); - -/*!\brief Helper to extract a block and low order "planar" model. */ -void aom_flat_block_finder_extract_block( - const aom_flat_block_finder_t *block_finder, const uint8_t *const data, - int w, int h, int stride, int offsx, int offsy, double *plane, - double *block); - -/*!\brief Runs the flat block finder on the input data. - * - * Find flat blocks in the input image data. Returns a map of - * flat_blocks, where the value of flat_blocks map will be non-zero - * when a block is determined to be flat. A higher value indicates a bigger - * confidence in the decision. - */ -int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, - const uint8_t *const data, int w, int h, - int stride, uint8_t *flat_blocks); - -// The noise shape indicates the allowed coefficients in the AR model. -typedef enum { - AOM_NOISE_SHAPE_DIAMOND = 0, - AOM_NOISE_SHAPE_SQUARE = 1 -} aom_noise_shape; - -// The parameters of the noise model include the shape type, lag, the -// bit depth of the input images provided, and whether the input images -// will be using uint16 (or uint8) representation. -typedef struct { - aom_noise_shape shape; - int lag; - int bit_depth; - int use_highbd; -} aom_noise_model_params_t; - -/*!\brief State of a noise model estimate for a single channel. 
- * - * This contains a system of equations that can be used to solve - * for the auto-regressive coefficients as well as a noise strength - * solver that can be used to model noise strength as a function of - * intensity. - */ -typedef struct { - aom_equation_system_t eqns; - aom_noise_strength_solver_t strength_solver; - int num_observations; // The number of observations in the eqn system - double ar_gain; // The gain of the current AR filter -} aom_noise_state_t; - -/*!\brief Complete model of noise for a planar video - * - * This includes a noise model for the latest frame and an aggregated - * estimate over all previous frames that had similar parameters. - */ -typedef struct { - aom_noise_model_params_t params; - aom_noise_state_t combined_state[3]; // Combined state per channel - aom_noise_state_t latest_state[3]; // Latest state per channel - int (*coords)[2]; // Offsets (x,y) of the coefficient samples - int n; // Number of parameters (size of coords) - int bit_depth; -} aom_noise_model_t; - -/*!\brief Result of a noise model update. */ -typedef enum { - AOM_NOISE_STATUS_OK = 0, - AOM_NOISE_STATUS_INVALID_ARGUMENT, - AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS, - AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, - AOM_NOISE_STATUS_INTERNAL_ERROR, -} aom_noise_status_t; - -/*!\brief Initializes a noise model with the given parameters. - * - * Returns 0 on failure. - */ -int aom_noise_model_init(aom_noise_model_t *model, - const aom_noise_model_params_t params); -void aom_noise_model_free(aom_noise_model_t *model); - -/*!\brief Updates the noise model with a new frame observation. - * - * Updates the noise model with measurements from the given input frame and a - * denoised variant of it. Noise is sampled from flat blocks using the flat - * block map. - * - * Returns a noise_status indicating if the update was successful. If the - * Update was successful, the combined_state is updated with measurements from - * the provided frame. 
If status is OK or DIFFERENT_NOISE_TYPE, the latest noise - * state will be updated with measurements from the provided frame. - * - * \param[in,out] noise_model The noise model to be updated - * \param[in] data Raw frame data - * \param[in] denoised Denoised frame data. - * \param[in] w Frame width - * \param[in] h Frame height - * \param[in] strides Stride of the planes - * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. - * \param[in] flat_blocks A map to blocks that have been determined flat - * \param[in] block_size The size of blocks. - */ -aom_noise_status_t aom_noise_model_update( - aom_noise_model_t *const noise_model, const uint8_t *const data[3], - const uint8_t *const denoised[3], int w, int h, int strides[3], - int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size); - -/*\brief Save the "latest" estimate into the "combined" estimate. - * - * This is meant to be called when the noise modeling detected a change - * in parameters (or for example, if a user wanted to reset estimation at - * a shot boundary). - */ -void aom_noise_model_save_latest(aom_noise_model_t *noise_model); - -/*!\brief Converts the noise_model parameters to the corresponding - * grain_parameters. - * - * The noise structs in this file are suitable for estimation (e.g., using - * floats), but the grain parameters in the bitstream are quantized. This - * function does the conversion by selecting the correct quantization levels. - */ -int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, - aom_film_grain_t *film_grain); - -/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd. - * - * \param[in] data Raw frame data - * \param[out] denoised Denoised frame data - * \param[in] w Frame width - * \param[in] h Frame height - * \param[in] stride Stride of the planes - * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. 
- * \param[in] noise_psd The power spectral density of the noise - * \param[in] block_size The size of blocks - * \param[in] bit_depth Bit depth of the image - * \param[in] use_highbd If true, uint8 pointers are interpreted as - * uint16 and stride is measured in uint16. - * This must be true when bit_depth >= 10. - */ -int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], - int w, int h, int stride[3], int chroma_sub_log2[2], - float *noise_psd[3], int block_size, int bit_depth, - int use_highbd); - -struct aom_denoise_and_model_t; - -/*!\brief Denoise the buffer and model the residual noise. - * - * This is meant to be called sequentially on input frames. The input buffer - * is denoised and the residual noise is modelled. The current noise estimate - * is populated in film_grain. Returns true on success. The grain.apply_grain - * parameter will be true when the input buffer was successfully denoised and - * grain was modelled. Returns false on error. - * - * \param[in] ctx Struct allocated with aom_denoise_and_model_alloc - * that holds some buffers for denoising and the current - * noise estimate. - * \param[in/out] buf The raw input buffer to be denoised. - * \param[out] grain Output film grain parameters - */ -int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain); - -/*!\brief Allocates a context that can be used for denoising and noise modeling. - * - * \param[in] bit_depth Bit depth of buffers this will be run on. 
- * \param[in] block_size Block size for noise modeling and flat block - * estimation - * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for - * higher levels of noise) - */ -struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, - int block_size, - float noise_level); - -/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc - */ -void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // AOM_AOM_DSP_NOISE_MODEL_H_ diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c deleted file mode 100644 index 87e8e9fec..000000000 --- a/third_party/aom/aom_dsp/noise_util.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <math.h> - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "aom_dsp/noise_util.h" -#include "aom_dsp/fft_common.h" -#include "aom_mem/aom_mem.h" -#include "config/aom_dsp_rtcd.h" - -float aom_noise_psd_get_default_value(int block_size, float factor) { - return (factor * factor / 10000) * block_size * block_size / 8; -} - -// Internal representation of noise transform. It keeps track of the -// transformed data and a temporary working buffer to use during the -// transform. 
-struct aom_noise_tx_t { - float *tx_block; - float *temp; - int block_size; - void (*fft)(const float *, float *, float *); - void (*ifft)(const float *, float *, float *); -}; - -struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) { - struct aom_noise_tx_t *noise_tx = - (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t)); - if (!noise_tx) return NULL; - memset(noise_tx, 0, sizeof(*noise_tx)); - switch (block_size) { - case 2: - noise_tx->fft = aom_fft2x2_float; - noise_tx->ifft = aom_ifft2x2_float; - break; - case 4: - noise_tx->fft = aom_fft4x4_float; - noise_tx->ifft = aom_ifft4x4_float; - break; - case 8: - noise_tx->fft = aom_fft8x8_float; - noise_tx->ifft = aom_ifft8x8_float; - break; - case 16: - noise_tx->fft = aom_fft16x16_float; - noise_tx->ifft = aom_ifft16x16_float; - break; - case 32: - noise_tx->fft = aom_fft32x32_float; - noise_tx->ifft = aom_ifft32x32_float; - break; - default: - aom_free(noise_tx); - fprintf(stderr, "Unsupported block size %d\n", block_size); - return NULL; - } - noise_tx->block_size = block_size; - noise_tx->tx_block = (float *)aom_memalign( - 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); - noise_tx->temp = (float *)aom_memalign( - 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size); - if (!noise_tx->tx_block || !noise_tx->temp) { - aom_noise_tx_free(noise_tx); - return NULL; - } - // Clear the buffers up front. 
Some outputs of the forward transform are - // real only (the imaginary component will never be touched) - memset(noise_tx->tx_block, 0, - 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); - memset(noise_tx->temp, 0, - 2 * sizeof(*noise_tx->temp) * block_size * block_size); - return noise_tx; -} - -void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) { - noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block); -} - -void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) { - const int block_size = noise_tx->block_size; - const float kBeta = 1.1f; - const float kEps = 1e-6f; - for (int y = 0; y < block_size; ++y) { - for (int x = 0; x < block_size; ++x) { - int i = y * block_size + x; - float *c = noise_tx->tx_block + 2 * i; - const float p = c[0] * c[0] + c[1] * c[1]; - if (p > kBeta * psd[i] && p > 1e-6) { - noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps); - noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps); - } else { - noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta; - noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta; - } - } - } -} - -void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) { - const int n = noise_tx->block_size * noise_tx->block_size; - noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data); - for (int i = 0; i < n; ++i) { - data[i] /= n; - } -} - -void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx, - float *psd) { - const int block_size = noise_tx->block_size; - for (int yb = 0; yb < block_size; ++yb) { - for (int xb = 0; xb <= block_size / 2; ++xb) { - float *c = noise_tx->tx_block + 2 * (yb * block_size + xb); - psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1]; - } - } -} - -void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) { - if (!noise_tx) return; - aom_free(noise_tx->tx_block); - aom_free(noise_tx->temp); - aom_free(noise_tx); -} - -double aom_normalized_cross_correlation(const double *a, const 
double *b, - int n) { - double c = 0; - double a_len = 0; - double b_len = 0; - for (int i = 0; i < n; ++i) { - a_len += a[i] * a[i]; - b_len += b[i] * b[i]; - c += a[i] * b[i]; - } - return c / (sqrt(a_len) * sqrt(b_len)); -} - -int aom_noise_data_validate(const double *data, int w, int h) { - const double kVarianceThreshold = 2; - const double kMeanThreshold = 2; - - int x = 0, y = 0; - int ret_value = 1; - double var = 0, mean = 0; - double *mean_x, *mean_y, *var_x, *var_y; - - // Check that noise variance is not increasing in x or y - // and that the data is zero mean. - mean_x = (double *)aom_malloc(sizeof(*mean_x) * w); - var_x = (double *)aom_malloc(sizeof(*var_x) * w); - mean_y = (double *)aom_malloc(sizeof(*mean_x) * h); - var_y = (double *)aom_malloc(sizeof(*var_y) * h); - - memset(mean_x, 0, sizeof(*mean_x) * w); - memset(var_x, 0, sizeof(*var_x) * w); - memset(mean_y, 0, sizeof(*mean_y) * h); - memset(var_y, 0, sizeof(*var_y) * h); - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - const double d = data[y * w + x]; - var_x[x] += d * d; - var_y[y] += d * d; - mean_x[x] += d; - mean_y[y] += d; - var += d * d; - mean += d; - } - } - mean /= (w * h); - var = var / (w * h) - mean * mean; - - for (y = 0; y < h; ++y) { - mean_y[y] /= h; - var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y]; - if (fabs(var_y[y] - var) >= kVarianceThreshold) { - fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var); - ret_value = 0; - break; - } - if (fabs(mean_y[y] - mean) >= kMeanThreshold) { - fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean); - ret_value = 0; - break; - } - } - - for (x = 0; x < w; ++x) { - mean_x[x] /= w; - var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x]; - if (fabs(var_x[x] - var) >= kVarianceThreshold) { - fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var); - ret_value = 0; - break; - } - if (fabs(mean_x[x] - mean) >= kMeanThreshold) { - fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], 
mean); - ret_value = 0; - break; - } - } - - aom_free(mean_x); - aom_free(mean_y); - aom_free(var_x); - aom_free(var_y); - - return ret_value; -} diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h deleted file mode 100644 index 2284a171a..000000000 --- a/third_party/aom/aom_dsp/noise_util.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_NOISE_UTIL_H_ -#define AOM_AOM_DSP_NOISE_UTIL_H_ - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// aom_noise_tx_t is an abstraction of a transform that is used for denoising. -// It is meant to be lightweight and does hold the transformed data (as -// the user should not be manipulating the transformed data directly). -struct aom_noise_tx_t; - -// Allocates and returns a aom_noise_tx_t useful for denoising the given -// block_size. The resulting aom_noise_tx_t should be free'd with -// aom_noise_tx_free. -struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size); -void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx); - -// Transforms the internal data and holds it in the aom_noise_tx's internal -// buffer. For compatibility with existing SIMD implementations, "data" must -// be 32-byte aligned. -void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx, - const float *data); - -// Filters aom_noise_tx's internal data using the provided noise power spectral -// density. 
The PSD must be at least block_size * block_size and should be -// populated with a constant or via estimates taken from -// aom_noise_tx_add_energy. -void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd); - -// Performs an inverse transform using the internal transform data. -// For compatibility with existing SIMD implementations, "data" must be 32-byte -// aligned. -void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data); - -// Aggregates the power of the buffered transform data into the psd buffer. -void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx, - float *psd); - -// Returns a default value suitable for denosing a transform of the given -// block_size. The noise "factor" determines the strength of the noise to -// be removed. A value of about 2.5 can be used for moderate denoising, -// where a value of 5.0 can be used for a high level of denoising. -float aom_noise_psd_get_default_value(int block_size, float factor); - -// Computes normalized cross correlation of two vectors a and b of length n. -double aom_normalized_cross_correlation(const double *a, const double *b, - int n); - -// Validates the correlated noise in the data buffer of size (w, h). -int aom_noise_data_validate(const double *data, int w, int h); - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus - -#endif // AOM_AOM_DSP_NOISE_UTIL_H_ diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h deleted file mode 100644 index f3d87f264..000000000 --- a/third_party/aom/aom_dsp/postproc.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_POSTPROC_H_ -#define AOM_AOM_DSP_POSTPROC_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -// Fills a noise buffer with gaussian noise strength determined by sigma. -int aom_setup_noise(double sigma, int size, char *noise); - -#ifdef __cplusplus -} -#endif - -#endif // AOM_AOM_DSP_POSTPROC_H_ diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h deleted file mode 100644 index d003a986e..000000000 --- a/third_party/aom/aom_dsp/prob.h +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_PROB_H_ -#define AOM_AOM_DSP_PROB_H_ - -#include <assert.h> -#include <stdio.h> - -#include "config/aom_config.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/entcode.h" -#include "aom_ports/bitops.h" -#include "aom_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// TODO(negge): Rename this aom_prob once we remove vpxbool. -typedef uint16_t aom_cdf_prob; - -#define CDF_SIZE(x) ((x) + 1) -#define CDF_PROB_BITS 15 -#define CDF_PROB_TOP (1 << CDF_PROB_BITS) -#define CDF_INIT_TOP 32768 -#define CDF_SHIFT (15 - CDF_PROB_BITS) -/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative - probability (an "inverse" CDF). 
- This function converts from one representation to the other (and is its own - inverse).*/ -#define AOM_ICDF(x) (CDF_PROB_TOP - (x)) - -#if CDF_SHIFT == 0 - -#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF4(a0, a1, a2) \ - AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF5(a0, a1, a2, a3) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF6(a0, a1, a2, a3, a4) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, 
a8, a9, a10, a11) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ - AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ - AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ - AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ - a14) \ - AOM_ICDF(a0) \ - , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ - AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ - AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \ - AOM_ICDF(CDF_PROB_TOP), 0 - -#else -#define AOM_CDF2(a0) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \ - ((CDF_INIT_TOP - 2) >> 1)) / \ - ((CDF_INIT_TOP - 2)) + \ - 1) \ - , AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF3(a0, a1) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ - ((CDF_INIT_TOP - 3) >> 1)) / \ - ((CDF_INIT_TOP - 3)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ - ((CDF_INIT_TOP - 3) >> 1)) / \ - ((CDF_INIT_TOP - 3)) + \ - 2), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF4(a0, a1, a2) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ - ((CDF_INIT_TOP - 4) >> 1)) / \ - ((CDF_INIT_TOP - 4)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ - ((CDF_INIT_TOP - 4) >> 1)) / \ - ((CDF_INIT_TOP - 4)) + 
\ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ - ((CDF_INIT_TOP - 4) >> 1)) / \ - ((CDF_INIT_TOP - 4)) + \ - 3), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF5(a0, a1, a2, a3) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ - ((CDF_INIT_TOP - 5) >> 1)) / \ - ((CDF_INIT_TOP - 5)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ - ((CDF_INIT_TOP - 5) >> 1)) / \ - ((CDF_INIT_TOP - 5)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ - ((CDF_INIT_TOP - 5) >> 1)) / \ - ((CDF_INIT_TOP - 5)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ - ((CDF_INIT_TOP - 5) >> 1)) / \ - ((CDF_INIT_TOP - 5)) + \ - 4), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF6(a0, a1, a2, a3, a4) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ - ((CDF_INIT_TOP - 6) >> 1)) / \ - ((CDF_INIT_TOP - 6)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ - ((CDF_INIT_TOP - 6) >> 1)) / \ - ((CDF_INIT_TOP - 6)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ - ((CDF_INIT_TOP - 6) >> 1)) / \ - ((CDF_INIT_TOP - 6)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ - ((CDF_INIT_TOP - 6) >> 1)) / \ - ((CDF_INIT_TOP - 6)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ - ((CDF_INIT_TOP - 6) >> 1)) / \ - ((CDF_INIT_TOP - 6)) + \ - 5), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ - ((CDF_INIT_TOP - 7) >> 1)) / \ - ((CDF_INIT_TOP - 7)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ - ((CDF_INIT_TOP - 7) >> 1)) / \ - ((CDF_INIT_TOP - 7)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ - ((CDF_INIT_TOP - 7) >> 1)) / \ - ((CDF_INIT_TOP - 7)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ - ((CDF_INIT_TOP - 7) >> 1)) / \ 
- ((CDF_INIT_TOP - 7)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ - ((CDF_INIT_TOP - 7) >> 1)) / \ - ((CDF_INIT_TOP - 7)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ - ((CDF_INIT_TOP - 7) >> 1)) / \ - ((CDF_INIT_TOP - 7)) + \ - 6), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ - ((CDF_INIT_TOP - 8) >> 1)) / \ - ((CDF_INIT_TOP - 8)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ - ((CDF_INIT_TOP - 8) >> 1)) / \ - ((CDF_INIT_TOP - 8)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ - ((CDF_INIT_TOP - 8) >> 1)) / \ - ((CDF_INIT_TOP - 8)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ - ((CDF_INIT_TOP - 8) >> 1)) / \ - ((CDF_INIT_TOP - 8)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ - ((CDF_INIT_TOP - 8) >> 1)) / \ - ((CDF_INIT_TOP - 8)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ - ((CDF_INIT_TOP - 8) >> 1)) / \ - ((CDF_INIT_TOP - 8)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ - ((CDF_INIT_TOP - 8) >> 1)) / \ - ((CDF_INIT_TOP - 8)) + \ - 7), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ 
- 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ - ((CDF_INIT_TOP - 9) >> 1)) / \ - ((CDF_INIT_TOP - 9)) + \ - 8), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 8), \ - AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ - ((CDF_INIT_TOP - 10) >> 1)) / \ - ((CDF_INIT_TOP - 10)) + \ - 9), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - 
((CDF_INIT_TOP - 11)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 8), \ - AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 9), \ - AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ - ((CDF_INIT_TOP - 11) >> 1)) / \ - ((CDF_INIT_TOP - 11)) + \ - 10), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 6), \ - 
AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 8), \ - AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 9), \ - AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 10), \ - AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ - ((CDF_INIT_TOP - 12) >> 1)) / \ - ((CDF_INIT_TOP - 12)) + \ - 11), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 8), \ - AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 9), \ - AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> 
CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 10), \ - AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 11), \ - AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ - ((CDF_INIT_TOP - 13) >> 1)) / \ - ((CDF_INIT_TOP - 13)) + \ - 12), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 8), \ - AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 9), \ - AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 10), \ - AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 11), \ - AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - 
((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 12), \ - AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ - ((CDF_INIT_TOP - 14) >> 1)) / \ - ((CDF_INIT_TOP - 14)) + \ - 13), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 8), \ - AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 9), \ - AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 10), \ - AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 11), \ - AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 12), \ - AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 
1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 13), \ - AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ - ((CDF_INIT_TOP - 15) >> 1)) / \ - ((CDF_INIT_TOP - 15)) + \ - 14), \ - AOM_ICDF(CDF_PROB_TOP), 0 -#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ - a14) \ - AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 1) \ - , \ - AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 2), \ - AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 3), \ - AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 4), \ - AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 5), \ - AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 6), \ - AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 7), \ - AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 8), \ - AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 9), \ - AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 10), \ - AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 11), \ - AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 12), \ - AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - 
((CDF_INIT_TOP - 16)) + \ - 13), \ - AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 14), \ - AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ - ((CDF_INIT_TOP - 16) >> 1)) / \ - ((CDF_INIT_TOP - 16)) + \ - 15), \ - AOM_ICDF(CDF_PROB_TOP), 0 - -#endif - -static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { - assert(den != 0); - { - const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); - // (p > 255) ? 255 : (p < 1) ? 1 : p; - const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); - return (uint8_t)clipped_prob; - } -} - -static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { - int rate; - int i, tmp; - - static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2 }; - assert(nsymbs < 17); - rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) + - nsymbs2speed[nsymbs]; // + get_msb(nsymbs); - tmp = AOM_ICDF(0); - - // Single loop (faster) - for (i = 0; i < nsymbs - 1; ++i) { - tmp = (i == val) ? 0 : tmp; - if (tmp < cdf[i]) { - cdf[i] -= ((cdf[i] - tmp) >> rate); - } else { - cdf[i] += ((tmp - cdf[i]) >> rate); - } - } - cdf[nsymbs] += (cdf[nsymbs] < 32); -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_PROB_H_ diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c deleted file mode 100644 index 50f376a4a..000000000 --- a/third_party/aom/aom_dsp/psnr.c +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <math.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/psnr.h" -#include "aom_scale/yv12config.h" - -double aom_sse_to_psnr(double samples, double peak, double sse) { - if (sse > 0.0) { - const double psnr = 10.0 * log10(samples * peak * peak / sse); - return psnr > MAX_PSNR ? MAX_PSNR : psnr; - } else { - return MAX_PSNR; - } -} - -/* TODO(yaowu): The block_variance calls the unoptimized versions of variance() - * and highbd_8_variance(). It should not. - */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { - int i, j; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } -} - -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); - int64_t tsum = 0; - uint64_t tsse = 0; - for (int i = 0; i < h; ++i) { - int32_t lsum = 0; - for (int j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; - lsum += diff; - tsse += (uint32_t)(diff * diff); - } - tsum += lsum; - a += a_stride; - b += b_stride; - } - *sum = tsum; - *sse = tsse; -} - -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} - -static int64_t get_sse(const uint8_t *a, int 
a_stride, const uint8_t *b, - int b_stride, int width, int height) { - const int dw = width % 16; - const int dh = height % 16; - int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; - int x, y; - - if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; - } - - if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; - } - - for (y = 0; y < height / 16; ++y) { - const uint8_t *pa = a; - const uint8_t *pb = b; - for (x = 0; x < width / 16; ++x) { - aom_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; - - pa += 16; - pb += 16; - } - - a += 16 * a_stride; - b += 16 * b_stride; - } - - return total_sse; -} - -static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int width, - int height, unsigned int input_shift) { - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); - int64_t total_sse = 0; - int x, y; - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - int64_t diff; - diff = (a[x] >> input_shift) - (b[x] >> input_shift); - total_sse += diff * diff; - } - a += a_stride; - b += b_stride; - } - return total_sse; -} - -static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int64_t total_sse = 0; - int x, y; - const int dw = width % 16; - const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; - if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; - } - if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; - } - for (y = 0; y < height / 16; ++y) { - const uint8_t *pa = a; - const 
uint8_t *pb = b; - for (x = 0; x < width / 16; ++x) { - aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; - pa += 16; - pb += 16; - } - a += 16 * a_stride; - b += 16 * b_stride; - } - return total_sse; -} - -int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, int width, - int vstart, int height) { - return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, - b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, - width, height); -} - -int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { - assert(a->y_crop_width == b->y_crop_width); - assert(a->y_crop_height == b->y_crop_height); - - return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height); -} - -int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, int width, - int vstart, int height) { - return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, - b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, - width, height); -} - -int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { - assert(a->uv_crop_width == b->uv_crop_width); - assert(a->uv_crop_height == b->uv_crop_height); - - return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, - a->uv_crop_width, a->uv_crop_height); -} - -int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, int width, - int vstart, int height) { - return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, - b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, - width, height); -} - -int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { - assert(a->uv_crop_width == b->uv_crop_width); - assert(a->uv_crop_height == b->uv_crop_height); - - return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, - 
a->uv_crop_width, a->uv_crop_height); -} - -int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, - int width, int vstart, int height) { - return highbd_get_sse( - a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, - b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); -} - -int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { - assert(a->y_crop_width == b->y_crop_width); - assert(a->y_crop_height == b->y_crop_height); - assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); - assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); - - return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height); -} - -int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, - int width, int vstart, int height) { - return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart, - a->uv_stride, - b->u_buffer + vstart * b->uv_stride + hstart, - b->uv_stride, width, height); -} - -int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { - assert(a->uv_crop_width == b->uv_crop_width); - assert(a->uv_crop_height == b->uv_crop_height); - assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); - assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); - - return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, - a->uv_crop_width, a->uv_crop_height); -} - -int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, - int width, int vstart, int height) { - return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart, - a->uv_stride, - b->v_buffer + vstart * b->uv_stride + hstart, - b->uv_stride, width, height); -} - -int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { - assert(a->uv_crop_width == b->uv_crop_width); - assert(a->uv_crop_height == 
b->uv_crop_height); - assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); - assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); - - return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, - a->uv_crop_width, a->uv_crop_height); -} - -int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int plane, int highbd) { - if (highbd) { - switch (plane) { - case 0: return aom_highbd_get_y_sse(a, b); - case 1: return aom_highbd_get_u_sse(a, b); - case 2: return aom_highbd_get_v_sse(a, b); - default: assert(plane >= 0 && plane <= 2); return 0; - } - } - switch (plane) { - case 0: return aom_get_y_sse(a, b); - case 1: return aom_get_u_sse(a, b); - case 2: return aom_get_v_sse(a, b); - default: assert(plane >= 0 && plane <= 2); return 0; - } -} - -void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, - uint32_t bit_depth, uint32_t in_bit_depth) { - const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; - const int heights[3] = { a->y_crop_height, a->uv_crop_height, - a->uv_crop_height }; - const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; - const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; - int i; - uint64_t total_sse = 0; - uint32_t total_samples = 0; - const double peak = (double)((1 << in_bit_depth) - 1); - const unsigned int input_shift = bit_depth - in_bit_depth; - - for (i = 0; i < 3; ++i) { - const int w = widths[i]; - const int h = heights[i]; - const uint32_t samples = w * h; - uint64_t sse; - if (a->flags & YV12_FLAG_HIGHBITDEPTH) { - if (input_shift) { - sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i], - b_strides[i], w, h, input_shift); - } else { - sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], - b_strides[i], w, h); - } - } else { - sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, - h); - } - psnr->sse[1 + i] = sse; - psnr->samples[1 + i] = 
samples; - psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); - - total_sse += sse; - total_samples += samples; - } - - psnr->sse[0] = total_sse; - psnr->samples[0] = total_samples; - psnr->psnr[0] = - aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); -} - -void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr) { - static const double peak = 255.0; - const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; - const int heights[3] = { a->y_crop_height, a->uv_crop_height, - a->uv_crop_height }; - const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; - const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; - int i; - uint64_t total_sse = 0; - uint32_t total_samples = 0; - - for (i = 0; i < 3; ++i) { - const int w = widths[i]; - const int h = heights[i]; - const uint32_t samples = w * h; - const uint64_t sse = - get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); - psnr->sse[1 + i] = sse; - psnr->samples[1 + i] = samples; - psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); - - total_sse += sse; - total_samples += samples; - } - - psnr->sse[0] = total_sse; - psnr->samples[0] = total_samples; - psnr->psnr[0] = - aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); -} diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h deleted file mode 100644 index 58e4e71ee..000000000 --- a/third_party/aom/aom_dsp/psnr.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_PSNR_H_ -#define AOM_AOM_DSP_PSNR_H_ - -#include "aom_scale/yv12config.h" - -#define MAX_PSNR 100.0 - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct { - double psnr[4]; // total/y/u/v - uint64_t sse[4]; // total/y/u/v - uint32_t samples[4]; // total/y/u/v -} PSNR_STATS; - -/*!\brief Converts SSE to PSNR - * - * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). - * - * \param[in] samples Number of samples - * \param[in] peak Max sample value - * \param[in] sse Sum of squared errors - */ -double aom_sse_to_psnr(double samples, double peak, double sse); -int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, int width, - int vstart, int height); -int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); -int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, int width, - int vstart, int height); -int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); -int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, int width, - int vstart, int height); -int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); -int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int plane, int highbd); -int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, - int width, int vstart, int height); -int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b); -int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, - int width, int vstart, int height); -int64_t aom_highbd_get_u_sse(const 
YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b); -int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, int hstart, - int width, int vstart, int height); -int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b); -void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, - unsigned int bit_depth, unsigned int in_bit_depth); -void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr); - -double aom_psnrhvs(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *phvs_y, - double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); -#ifdef __cplusplus -} // extern "C" -#endif -#endif // AOM_AOM_DSP_PSNR_H_ diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c deleted file mode 100644 index 30fe21d9c..000000000 --- a/third_party/aom/aom_dsp/psnrhvs.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - * - * This code was originally written by: Gregory Maxwell, at the Daala - * project. 
- */ - -#include <assert.h> -#include <math.h> -#include <stdio.h> -#include <stdlib.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/psnr.h" -#include "aom_dsp/ssim.h" -#include "aom_ports/system_state.h" - -static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, - int xstride) { - int i, j; - (void)xstride; - aom_fdct8x8(x, y, ystride); - for (i = 0; i < 8; i++) - for (j = 0; j < 8; j++) - *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; -} - -static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, - int xstride) { - int i, j; - (void)xstride; - aom_highbd_fdct8x8(x, y, ystride); - for (i = 0; i < 8; i++) - for (j = 0; j < 8; j++) - *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; -} - -/* Normalized inverse quantization matrix for 8x8 DCT at the point of - * transparency. This is not the JPEG based matrix from the paper, - this one gives a slightly higher MOS agreement.*/ -static const double csf_y[8][8] = { - { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, - 0.678296995242, 0.466224900598, 0.3265091542 }, - { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, - 0.868920337363, 0.61280991668, 0.436405793551 }, - { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, - 0.670882927016, 0.501731932449, 0.372504254596 }, - { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, - 0.48309405692, 0.380429446972, 0.295774038565 }, - { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, - 0.352889268808, 0.283006984131, 0.226951348204 }, - { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, - 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, - { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, - 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, - { 0.3265091542, 0.436405793551, 0.372504254596, 
0.295774038565, - 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } -}; -static const double csf_cb420[8][8] = { - { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, - 0.898018824055, 0.74725392039, 0.615105596242 }, - { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, - 1.17428548929, 0.996404342439, 0.830890433625 }, - { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, - 0.960060382087, 0.849823426169, 0.731221236837 }, - { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, - 0.751437590932, 0.685398513368, 0.608694761374 }, - { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, - 0.605503172737, 0.55002013668, 0.495804539034 }, - { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, - 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, - { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, - 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, - { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, - 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } -}; -static const double csf_cr420[8][8] = { - { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, - 0.867069376285, 0.721500455585, 0.593906509971 }, - { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, - 1.13381474809, 0.962064122248, 0.802254508198 }, - { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, - 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, - { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, - 0.725539939514, 0.661776842059, 0.587716619023 }, - { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, - 0.584635025748, 0.531064164893, 0.478717061273 }, - { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, - 0.584635025748, 0.496936637883, 
0.438694579826, 0.393021669543 }, - { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, - 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, - { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, - 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } -}; - -static double convert_score_db(double _score, double _weight, int bit_depth) { - int16_t pix_max = 255; - assert(_score * _weight >= 0.0); - if (bit_depth == 10) - pix_max = 1023; - else if (bit_depth == 12) - pix_max = 4095; - - if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; - return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); -} - -static double calc_psnrhvs(const unsigned char *src, int _systride, - const unsigned char *dst, int _dystride, double _par, - int _w, int _h, int _step, const double _csf[8][8], - uint32_t _shift, int buf_is_hbd) { - double ret; - const uint8_t *_src8 = src; - const uint8_t *_dst8 = dst; - const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); - const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); - DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); - DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); - DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); - DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); - double mask[8][8]; - int pixels; - int x; - int y; - (void)_par; - ret = pixels = 0; - /*In the PSNR-HVS-M paper[1] the authors describe the construction of - their masking table as "we have used the quantization table for the - color component Y of JPEG [6] that has been also obtained on the - basis of CSF. Note that the values in quantization table JPEG have - been normalized and then squared." Their CSF matrix (from PSNR-HVS) - was also constructed from the JPEG matrices. I can not find any obvious - scheme of normalizing to produce their table, but if I multiply their - CSF by 0.38857 and square the result I get their masking table. 
- I have no idea where this constant comes from, but deviating from it - too greatly hurts MOS agreement. - - [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, - Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking - of DCT basis functions", CD-ROM Proceedings of the Third - International Workshop on Video Processing and Quality Metrics for Consumer - Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ - for (x = 0; x < 8; x++) - for (y = 0; y < 8; y++) - mask[x][y] = - (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); - for (y = 0; y < _h - 7; y += _step) { - for (x = 0; x < _w - 7; x += _step) { - int i; - int j; - double s_means[4]; - double d_means[4]; - double s_vars[4]; - double d_vars[4]; - double s_gmean = 0; - double d_gmean = 0; - double s_gvar = 0; - double d_gvar = 0; - double s_mask = 0; - double d_mask = 0; - for (i = 0; i < 4; i++) - s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - int sub = ((i & 12) >> 2) + ((j & 12) >> 1); - if (!buf_is_hbd) { - dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; - dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; - } else { - dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; - dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; - } - s_gmean += dct_s[i * 8 + j]; - d_gmean += dct_d[i * 8 + j]; - s_means[sub] += dct_s[i * 8 + j]; - d_means[sub] += dct_d[i * 8 + j]; - } - } - s_gmean /= 64.f; - d_gmean /= 64.f; - for (i = 0; i < 4; i++) s_means[i] /= 16.f; - for (i = 0; i < 4; i++) d_means[i] /= 16.f; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - int sub = ((i & 12) >> 2) + ((j & 12) >> 1); - s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean); - d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean); - s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) * - (dct_s[i * 8 + j] - 
s_means[sub]); - d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) * - (dct_d[i * 8 + j] - d_means[sub]); - } - } - s_gvar *= 1 / 63.f * 64; - d_gvar *= 1 / 63.f * 64; - for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16; - for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16; - if (s_gvar > 0) - s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar; - if (d_gvar > 0) - d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar; - if (!buf_is_hbd) { - od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); - od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); - } else { - hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); - hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); - } - for (i = 0; i < 8; i++) - for (j = (i == 0); j < 8; j++) - s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; - for (i = 0; i < 8; i++) - for (j = (i == 0); j < 8; j++) - d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j]; - s_mask = sqrt(s_mask * s_gvar) / 32.f; - d_mask = sqrt(d_mask * d_gvar) / 32.f; - if (d_mask > s_mask) s_mask = d_mask; - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - double err; - err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); - if (i != 0 || j != 0) - err = err < s_mask / mask[i][j] ? 
0 : err - s_mask / mask[i][j]; - ret += (err * _csf[i][j]) * (err * _csf[i][j]); - pixels++; - } - } - } - } - if (pixels <= 0) return 0; - ret /= pixels; - return ret; -} - -double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, - double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs, - uint32_t bd, uint32_t in_bd) { - double psnrhvs; - const double par = 1.0; - const int step = 7; - uint32_t bd_shift = 0; - aom_clear_system_state(); - assert(bd == 8 || bd == 10 || bd == 12); - assert(bd >= in_bd); - assert(src->flags == dst->flags); - const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH; - - bd_shift = bd - in_bd; - - *y_psnrhvs = calc_psnrhvs( - src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par, - src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd); - *u_psnrhvs = - calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, - par, src->uv_crop_width, src->uv_crop_height, step, - csf_cb420, bd_shift, buf_is_hbd); - *v_psnrhvs = - calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, - par, src->uv_crop_width, src->uv_crop_height, step, - csf_cr420, bd_shift, buf_is_hbd); - psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); - return convert_score_db(psnrhvs, 1.0, in_bd); -} diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c deleted file mode 100644 index 62dbd86a9..000000000 --- a/third_party/aom/aom_dsp/quantize.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/quantize.h" -#include "aom_mem/aom_mem.h" - -void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr, const int log_scale) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), - ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - int i, non_zero_count = (int)n_coeffs, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const int coeff = coeff_ptr[rc] * wt; - - if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && - coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp32; - - const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); - if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { - int64_t tmp = - clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), - INT16_MIN, INT16_MAX); - tmp *= wt; - tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - (16 - log_scale + AOM_QM_BITS)); // quantization - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); - const int dequant = - (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; - dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - - if (tmp32) eob = i; - } - } - *eob_ptr = eob + 1; -} - -void highbd_quantize_b_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr, const int log_scale) { - int i, eob = -1; - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), - ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - int dequant; - int idx_arr[4096]; - (void)iscan; - int idx = 0; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const int coeff = coeff_ptr[rc] * wt; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. 
- if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || - coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); - const int64_t tmpw = tmp1 * wt; - const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> - (16 - log_scale + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; - dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - if (abs_qcoeff) eob = idx_arr[i]; - } - *eob_ptr = eob + 1; -} - -/* These functions should only be called when quantisation matrices - are not used. 
*/ -void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, - eob_ptr, scan, iscan, NULL, NULL, 0); -} - -void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, - eob_ptr, scan, iscan, NULL, NULL, 1); -} - -void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, - eob_ptr, scan, iscan, NULL, NULL, 2); -} - -void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, - 
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, - NULL, NULL, 0); -} - -void aom_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, - NULL, NULL, 1); -} - -void aom_highbd_quantize_b_64x64_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, - NULL, NULL, 2); -} diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h deleted file mode 100644 index c55ab234e..000000000 --- a/third_party/aom/aom_dsp/quantize.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_QUANTIZE_H_ -#define AOM_AOM_DSP_QUANTIZE_H_ - -#include "config/aom_config.h" - -#include "aom_dsp/aom_dsp_common.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr, const int log_scale); - -void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); - -void highbd_quantize_b_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr, const int log_scale); - -void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_QUANTIZE_H_ diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c deleted file mode 100644 index 1e24df4a5..000000000 --- a/third_party/aom/aom_dsp/sad.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) 2016, 
Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <stdlib.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/blend.h" - -/* Sum the difference between every corresponding element of the buffers. */ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int y, x; - unsigned int sad = 0; - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); - - a += a_stride; - b += b_stride; - } - return sad; -} - -#define sadMxh(m) \ - unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, int width, \ - int height) { \ - return sad(a, a_stride, b, b_stride, width, height); \ - } - -#define sadMxN(m, n) \ - unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ - } \ - unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ - aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ - } \ - unsigned int aom_jnt_sad##m##x##n##_avg_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - 
uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ - return sad(src, src_stride, comp_pred, m, m, n); \ - } - -// Calculate sad against 4 reference locations and store each in sad_array -#define sadMxNx4D(m, n) \ - void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) \ - sad_array[i] = \ - aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ - } - -/* clang-format off */ -// 128x128 -sadMxN(128, 128) -sadMxNx4D(128, 128) - -// 128x64 -sadMxN(128, 64) -sadMxNx4D(128, 64) - -// 64x128 -sadMxN(64, 128) -sadMxNx4D(64, 128) - -// 64x64 -sadMxN(64, 64) -sadMxNx4D(64, 64) - -// 64x32 -sadMxN(64, 32) -sadMxNx4D(64, 32) - -// 32x64 -sadMxN(32, 64) -sadMxNx4D(32, 64) - -// 32x32 -sadMxN(32, 32) -sadMxNx4D(32, 32) - -// 32x16 -sadMxN(32, 16) -sadMxNx4D(32, 16) - -// 16x32 -sadMxN(16, 32) -sadMxNx4D(16, 32) - -// 16x16 -sadMxN(16, 16) -sadMxNx4D(16, 16) - -// 16x8 -sadMxN(16, 8) -sadMxNx4D(16, 8) - -// 8x16 -sadMxN(8, 16) -sadMxNx4D(8, 16) - -// 8x8 -sadMxN(8, 8) -sadMxNx4D(8, 8) - -// 8x4 -sadMxN(8, 4) -sadMxNx4D(8, 4) - -// 4x8 -sadMxN(4, 8) -sadMxNx4D(4, 8) - -// 4x4 -sadMxN(4, 4) -sadMxNx4D(4, 4) - -sadMxh(128); -sadMxh(64); -sadMxh(32); -sadMxh(16); -sadMxh(8); -sadMxh(4); - -sadMxN(4, 16) -sadMxNx4D(4, 16) -sadMxN(16, 4) -sadMxNx4D(16, 4) -sadMxN(8, 32) -sadMxNx4D(8, 32) -sadMxN(32, 8) -sadMxNx4D(32, 8) -sadMxN(16, 64) -sadMxNx4D(16, 64) -sadMxN(64, 16) -sadMxNx4D(64, 16) - - /* clang-format on */ - - static INLINE - unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, int width, int height) { - int y, x; - unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); - - a += 
a_stride; - b += b_stride; - } - return sad; -} - -static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, - const uint16_t *b, int b_stride, - int width, int height) { - int y, x; - unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); - - a += a_stride; - b += b_stride; - } - return sad; -} - -#define highbd_sadMxN(m, n) \ - unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, \ - int ref_stride) { \ - return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ - } \ - unsigned int aom_highbd_sad##m##x##n##_avg_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - uint16_t comp_pred[m * n]; \ - aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \ - ref, ref_stride); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ - } \ - unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t comp_pred[m * n]; \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \ - m, n, ref, ref_stride, jcp_param); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ - } - -#define highbd_sadMxNx4D(m, n) \ - void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ - ref_array[i], ref_stride); \ - } \ - } - -/* clang-format off */ -// 128x128 -highbd_sadMxN(128, 128) -highbd_sadMxNx4D(128, 128) - -// 128x64 -highbd_sadMxN(128, 64) -highbd_sadMxNx4D(128, 64) - -// 64x128 -highbd_sadMxN(64, 128) -highbd_sadMxNx4D(64, 128) - -// 64x64 -highbd_sadMxN(64, 
64) -highbd_sadMxNx4D(64, 64) - -// 64x32 -highbd_sadMxN(64, 32) -highbd_sadMxNx4D(64, 32) - -// 32x64 -highbd_sadMxN(32, 64) -highbd_sadMxNx4D(32, 64) - -// 32x32 -highbd_sadMxN(32, 32) -highbd_sadMxNx4D(32, 32) - -// 32x16 -highbd_sadMxN(32, 16) -highbd_sadMxNx4D(32, 16) - -// 16x32 -highbd_sadMxN(16, 32) -highbd_sadMxNx4D(16, 32) - -// 16x16 -highbd_sadMxN(16, 16) -highbd_sadMxNx4D(16, 16) - -// 16x8 -highbd_sadMxN(16, 8) -highbd_sadMxNx4D(16, 8) - -// 8x16 -highbd_sadMxN(8, 16) -highbd_sadMxNx4D(8, 16) - -// 8x8 -highbd_sadMxN(8, 8) -highbd_sadMxNx4D(8, 8) - -// 8x4 -highbd_sadMxN(8, 4) -highbd_sadMxNx4D(8, 4) - -// 4x8 -highbd_sadMxN(4, 8) -highbd_sadMxNx4D(4, 8) - -// 4x4 -highbd_sadMxN(4, 4) -highbd_sadMxNx4D(4, 4) - -highbd_sadMxN(4, 16) -highbd_sadMxNx4D(4, 16) -highbd_sadMxN(16, 4) -highbd_sadMxNx4D(16, 4) -highbd_sadMxN(8, 32) -highbd_sadMxNx4D(8, 32) -highbd_sadMxN(32, 8) -highbd_sadMxNx4D(32, 8) -highbd_sadMxN(16, 64) -highbd_sadMxNx4D(16, 64) -highbd_sadMxN(64, 16) -highbd_sadMxNx4D(64, 16) - /* clang-format on */ diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c deleted file mode 100644 index c176001d6..000000000 --- a/third_party/aom/aom_dsp/sad_av1.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/blend.h" - -static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride, - const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, int width, - int height) { - int y, x; - unsigned int sad = 0; - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); - sad += abs(pred - src[x]); - } - src += src_stride; - a += a_stride; - b += b_stride; - m += m_stride; - } - sad = (sad + 31) >> 6; - return sad; -} - -#define MASKSADMxN(m, n) \ - unsigned int aom_masked_sad##m##x##n##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ - int invert_mask) { \ - if (!invert_mask) \ - return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \ - msk_stride, m, n); \ - else \ - return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \ - msk_stride, m, n); \ - } - -/* clang-format off */ -MASKSADMxN(128, 128) -MASKSADMxN(128, 64) -MASKSADMxN(64, 128) -MASKSADMxN(64, 64) -MASKSADMxN(64, 32) -MASKSADMxN(32, 64) -MASKSADMxN(32, 32) -MASKSADMxN(32, 16) -MASKSADMxN(16, 32) -MASKSADMxN(16, 16) -MASKSADMxN(16, 8) -MASKSADMxN(8, 16) -MASKSADMxN(8, 8) -MASKSADMxN(8, 4) -MASKSADMxN(4, 8) -MASKSADMxN(4, 4) -MASKSADMxN(4, 16) -MASKSADMxN(16, 4) -MASKSADMxN(8, 32) -MASKSADMxN(32, 8) -MASKSADMxN(16, 64) -MASKSADMxN(64, 16) - - /* clang-format on */ - - static INLINE - unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, - const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, int width, - int height) { - int y, x; - unsigned int sad = 0; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const uint16_t *a = 
CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); - sad += abs(pred - src[x]); - } - - src += src_stride; - a += a_stride; - b += b_stride; - m += m_stride; - } - sad = (sad + 31) >> 6; - - return sad; -} - -#define HIGHBD_MASKSADMXN(m, n) \ - unsigned int aom_highbd_masked_sad##m##x##n##_c( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ - int msk_stride, int invert_mask) { \ - if (!invert_mask) \ - return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ - second_pred8, m, msk, msk_stride, m, n); \ - else \ - return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ - ref_stride, msk, msk_stride, m, n); \ - } - -HIGHBD_MASKSADMXN(128, 128) -HIGHBD_MASKSADMXN(128, 64) -HIGHBD_MASKSADMXN(64, 128) -HIGHBD_MASKSADMXN(64, 64) -HIGHBD_MASKSADMXN(64, 32) -HIGHBD_MASKSADMXN(32, 64) -HIGHBD_MASKSADMXN(32, 32) -HIGHBD_MASKSADMXN(32, 16) -HIGHBD_MASKSADMXN(16, 32) -HIGHBD_MASKSADMXN(16, 16) -HIGHBD_MASKSADMXN(16, 8) -HIGHBD_MASKSADMXN(8, 16) -HIGHBD_MASKSADMXN(8, 8) -HIGHBD_MASKSADMXN(8, 4) -HIGHBD_MASKSADMXN(4, 8) -HIGHBD_MASKSADMXN(4, 4) -HIGHBD_MASKSADMXN(4, 16) -HIGHBD_MASKSADMXN(16, 4) -HIGHBD_MASKSADMXN(8, 32) -HIGHBD_MASKSADMXN(32, 8) -HIGHBD_MASKSADMXN(16, 64) -HIGHBD_MASKSADMXN(64, 16) - -// pre: predictor being evaluated -// wsrc: target weighted prediction (has been *4096 to keep precision) -// mask: 2d weights (scaled by 4096) -static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, - const int32_t *wsrc, const int32_t *mask, - int width, int height) { - int y, x; - unsigned int sad = 0; - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); - - pre += pre_stride; - wsrc += width; - mask += width; - } - - return sad; -} - 
-#define OBMCSADMxN(m, n) \ - unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ - } - -/* clang-format off */ -OBMCSADMxN(128, 128) -OBMCSADMxN(128, 64) -OBMCSADMxN(64, 128) -OBMCSADMxN(64, 64) -OBMCSADMxN(64, 32) -OBMCSADMxN(32, 64) -OBMCSADMxN(32, 32) -OBMCSADMxN(32, 16) -OBMCSADMxN(16, 32) -OBMCSADMxN(16, 16) -OBMCSADMxN(16, 8) -OBMCSADMxN(8, 16) -OBMCSADMxN(8, 8) -OBMCSADMxN(8, 4) -OBMCSADMxN(4, 8) -OBMCSADMxN(4, 4) -OBMCSADMxN(4, 16) -OBMCSADMxN(16, 4) -OBMCSADMxN(8, 32) -OBMCSADMxN(32, 8) -OBMCSADMxN(16, 64) -OBMCSADMxN(64, 16) - /* clang-format on */ - - static INLINE - unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, const int32_t *mask, - int width, int height) { - int y, x; - unsigned int sad = 0; - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); - - pre += pre_stride; - wsrc += width; - mask += width; - } - - return sad; -} - -#define HIGHBD_OBMCSADMXN(m, n) \ - unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ - const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ - const int32_t *mask) { \ - return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ - } - -/* clang-format off */ -HIGHBD_OBMCSADMXN(128, 128) -HIGHBD_OBMCSADMXN(128, 64) -HIGHBD_OBMCSADMXN(64, 128) -HIGHBD_OBMCSADMXN(64, 64) -HIGHBD_OBMCSADMXN(64, 32) -HIGHBD_OBMCSADMXN(32, 64) -HIGHBD_OBMCSADMXN(32, 32) -HIGHBD_OBMCSADMXN(32, 16) -HIGHBD_OBMCSADMXN(16, 32) -HIGHBD_OBMCSADMXN(16, 16) -HIGHBD_OBMCSADMXN(16, 8) -HIGHBD_OBMCSADMXN(8, 16) -HIGHBD_OBMCSADMXN(8, 8) -HIGHBD_OBMCSADMXN(8, 4) -HIGHBD_OBMCSADMXN(4, 8) -HIGHBD_OBMCSADMXN(4, 4) -HIGHBD_OBMCSADMXN(4, 16) -HIGHBD_OBMCSADMXN(16, 4) -HIGHBD_OBMCSADMXN(8, 32) -HIGHBD_OBMCSADMXN(32, 8) -HIGHBD_OBMCSADMXN(16, 64) 
-HIGHBD_OBMCSADMXN(64, 16) -/* clang-format on */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h deleted file mode 100644 index 01dbb8fd2..000000000 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ -#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "aom_dsp/simd/v128_intrinsics_c.h" -#include "aom_dsp/simd/v64_intrinsics.h" - -/* Fallback to plain, unoptimised C. 
*/ - -typedef c_v128 v128; - -SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); } -SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); } -SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); } -SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) { - return c_v128_from_64(hi, lo); -} -SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) { - return c_v128_from_v64(hi, lo); -} -SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return c_v128_from_32(a, b, c, d); -} - -SIMD_INLINE v128 v128_load_unaligned(const void *p) { - return c_v128_load_unaligned(p); -} -SIMD_INLINE v128 v128_load_aligned(const void *p) { - return c_v128_load_aligned(p); -} - -SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { - c_v128_store_unaligned(p, a); -} -SIMD_INLINE void v128_store_aligned(void *p, v128 a) { - c_v128_store_aligned(p, a); -} - -SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { - return c_v128_align(a, b, c); -} - -SIMD_INLINE v128 v128_zero() { return c_v128_zero(); } -SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); } -SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); } -SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); } -SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); } - -typedef uint32_t sad128_internal; -SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); } -SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { - return c_v128_sad_u8(s, a, b); -} -SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { - return c_v128_sad_u8_sum(s); -} -typedef uint32_t ssd128_internal; -SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); } -SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { - return c_v128_ssd_u8(s, a, b); -} -SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { - return c_v128_ssd_u8_sum(s); -} 
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { - return c_v128_dotp_su8(a, b); -} -SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { - return c_v128_dotp_s16(a, b); -} -SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { - return c_v128_dotp_s32(a, b); -} -SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); } - -SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); } -SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); } -SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); } -SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } - -SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } -SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } -SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); } -SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); } -SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } -SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } -SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); } -SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); } -SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } -SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } -SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); } -SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); } -SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } -SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } -SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } -SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); } -SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); } -SIMD_INLINE v128 v128_abs_s16(v128 a) { 
return c_v128_abs_s16(a); } -SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } - -SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); } -SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { - return c_v128_mullo_s16(a, b); -} -SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { - return c_v128_mulhi_s16(a, b); -} -SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { - return c_v128_mullo_s32(a, b); -} -SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } -SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } - -SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); } -SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { - return c_v128_blend_8(a, b, c); -} - -SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } -SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } -SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { - return c_v128_rdavg_u16(a, b); -} -SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } -SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } -SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } -SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } -SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } -SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } -SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } -SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); } -SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); } - -SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } -SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } -SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); } -SIMD_INLINE v128 
v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); } -SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); } -SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); } -SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); } -SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); } -SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); } -SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); } -SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); } -SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { - return c_v128_unziplo_8(a, b); -} -SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { - return c_v128_unziphi_8(a, b); -} -SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { - return c_v128_unziplo_16(a, b); -} -SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { - return c_v128_unziphi_16(a, b); -} -SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { - return c_v128_unziplo_32(a, b); -} -SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { - return c_v128_unziphi_32(a, b); -} -SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); } -SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { - return c_v128_unpacklo_u8_s16(a); -} -SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { - return c_v128_unpackhi_u8_s16(a); -} -SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); } -SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { - return c_v128_unpacklo_s8_s16(a); -} -SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { - return c_v128_unpackhi_s8_s16(a); -} -SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { - return c_v128_pack_s32_s16(a, b); -} -SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { - return c_v128_pack_s32_u16(a, b); -} -SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { - return c_v128_pack_s16_u8(a, b); -} -SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { - return 
c_v128_pack_s16_s8(a, b); -} -SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); } -SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); } -SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { - return c_v128_unpacklo_u16_s32(a); -} -SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { - return c_v128_unpacklo_s16_s32(a); -} -SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { - return c_v128_unpackhi_u16_s32(a); -} -SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { - return c_v128_unpackhi_s16_s32(a); -} -SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) { - return c_v128_shuffle_8(a, pattern); -} - -SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); } -SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); } -SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); } -SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { - return c_v128_cmpgt_s16(a, b); -} -SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { - return c_v128_cmplt_s16(a, b); -} -SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } - -SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { - return c_v128_cmpgt_s32(a, b); -} -SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { - return c_v128_cmplt_s32(a, b); -} -SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); } - -SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { - return c_v128_shl_8(a, c); -} -SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { - return c_v128_shr_u8(a, c); -} -SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { - return c_v128_shr_s8(a, c); -} -SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { - return c_v128_shl_16(a, c); -} -SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { - return c_v128_shr_u16(a, c); -} -SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { - return c_v128_shr_s16(a, c); -} -SIMD_INLINE v128 v128_shl_32(v128 a, unsigned 
int c) { - return c_v128_shl_32(a, c); -} -SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { - return c_v128_shr_u32(a, c); -} -SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { - return c_v128_shr_s32(a, c); -} -SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { - return c_v128_shl_64(a, c); -} -SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { - return c_v128_shr_u64(a, c); -} -SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { - return c_v128_shr_s64(a, c); -} - -SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { - return c_v128_shr_n_byte(a, n); -} -SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { - return c_v128_shl_n_byte(a, n); -} -SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) { - return c_v128_shl_n_8(a, n); -} -SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { - return c_v128_shl_n_16(a, n); -} -SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { - return c_v128_shl_n_32(a, n); -} -SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) { - return c_v128_shl_n_64(a, n); -} -SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) { - return c_v128_shr_n_u8(a, n); -} -SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { - return c_v128_shr_n_u16(a, n); -} -SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { - return c_v128_shr_n_u32(a, n); -} -SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) { - return c_v128_shr_n_u64(a, n); -} -SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { - return c_v128_shr_n_s8(a, n); -} -SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { - return c_v128_shr_n_s16(a, n); -} -SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { - return c_v128_shr_n_s32(a, n); -} -SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) { - return c_v128_shr_n_s64(a, n); -} - -typedef uint32_t sad128_internal_u16; -SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { - return c_v128_sad_u16_init(); -} -SIMD_INLINE sad128_internal_u16 
v128_sad_u16(sad128_internal_u16 s, v128 a, - v128 b) { - return c_v128_sad_u16(s, a, b); -} -SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { - return c_v128_sad_u16_sum(s); -} - -typedef uint64_t ssd128_internal_s16; -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { - return c_v128_ssd_s16_init(); -} -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, - v128 b) { - return c_v128_ssd_s16(s, a, b); -} -SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { - return c_v128_ssd_s16_sum(s); -} - -#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h deleted file mode 100644 index 3c669d579..000000000 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h +++ /dev/null @@ -1,958 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ - -#include <arm_neon.h> - -#include "aom_dsp/simd/v64_intrinsics_arm.h" - -typedef int64x2_t v128; - -SIMD_INLINE uint32_t v128_low_u32(v128 a) { - return v64_low_u32(vget_low_s64(a)); -} - -SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); } - -SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); } - -SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); } - -SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { - return vcombine_s64((int64x1_t)b, (int64x1_t)a); -} - -SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b)); -} - -SIMD_INLINE v128 v128_load_aligned(const void *p) { - return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p)); -} - -SIMD_INLINE v128 v128_load_unaligned(const void *p) { - return v128_load_aligned(p); -} - -SIMD_INLINE void v128_store_aligned(void *p, v128 r) { - vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); -} - -SIMD_INLINE void v128_store_unaligned(void *p, v128 r) { - vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); -} - -SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { -// The following functions require an immediate. -// Some compilers will check this during optimisation, others wont. -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - return c ? vreinterpretq_s64_s8( - vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c)) - : b; -#else - return c < 8 ? 
v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c), - v64_align(v128_high_v64(b), v128_low_v64(b), c)) - : v128_from_v64( - v64_align(v128_high_v64(a), v128_low_v64(a), c - 8), - v64_align(v128_low_v64(a), v128_high_v64(b), c - 8)); -#endif -} - -SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); } - -SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); } - -SIMD_INLINE v128 v128_dup_8(uint8_t x) { - return vreinterpretq_s64_u8(vdupq_n_u8(x)); -} - -SIMD_INLINE v128 v128_dup_16(uint16_t x) { - return vreinterpretq_s64_u16(vdupq_n_u16(x)); -} - -SIMD_INLINE v128 v128_dup_32(uint32_t x) { - return vreinterpretq_s64_u32(vdupq_n_u32(x)); -} - -SIMD_INLINE v128 v128_dup_64(uint64_t x) { - return vreinterpretq_s64_u64(vdupq_n_u64(x)); -} - -SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { - int16x8_t t1 = vmulq_s16( - vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b))))); - int16x8_t t2 = vmulq_s16( - vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b))))); -#if defined(__aarch64__) - return vaddlvq_s16(t1) + vaddlvq_s16(t2); -#else - int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2))); - return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); -#endif -} - -SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { - return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + - v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); -} - -SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { - int64x2_t t = vpaddlq_s32( - vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); - return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); -} - -SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { -#if defined(__aarch64__) - return vaddlvq_u8(vreinterpretq_u8_s64(x)); -#else - uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); - return vget_lane_s32( 
- vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); -#endif -} - -SIMD_INLINE v128 v128_padd_s16(v128 a) { - return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); -} - -SIMD_INLINE v128 v128_padd_u8(v128 a) { - return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a))); -} - -typedef struct { - sad64_internal hi, lo; -} sad128_internal; - -SIMD_INLINE sad128_internal v128_sad_u8_init() { - sad128_internal s; - s.hi = s.lo = vdupq_n_u16(0); - return s; -} - -/* Implementation dependent return value. Result must be finalised with - v128_sad_u8_sum(). - The result for more than 32 v128_sad_u8() calls is undefined. */ -SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { - sad128_internal r; - r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); - r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); - return r; -} - -SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { -#if defined(__aarch64__) - return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo); -#else - uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo))); - return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t)); -#endif -} - -typedef struct { - ssd64_internal hi, lo; -} ssd128_internal; - -SIMD_INLINE ssd128_internal v128_ssd_u8_init() { - ssd128_internal s; - s.hi = s.lo = v64_ssd_u8_init(); - return s; -} - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_u8_sum(). 
*/ -SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { - ssd128_internal r; - r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); - r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); - return r; -} - -SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { - return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo)); -} - -SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); } - -SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); } - -SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); } - -SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); } - -SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); -} - -SIMD_INLINE v128 v128_add_64(v128 x, v128 y) { - return vreinterpretq_s64_u64( - vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y))); -} - -SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vsubq_s16(vreinterpretq_s16_s64(x), 
vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); } - -SIMD_INLINE v128 v128_abs_s16(v128 x) { - return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); -} - -SIMD_INLINE v128 v128_abs_s8(v128 x) { - return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x))); -} - -SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { - return vreinterpretq_s64_s32( - vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b))); -} - -SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { - return vreinterpretq_s64_s16( - vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); -} - -SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { -#if defined(__aarch64__) - return vreinterpretq_s64_s16(vuzp2q_s16( - vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), - vreinterpret_s16_s64(vget_low_s64(b)))), - vreinterpretq_s16_s32( - vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))))); -#else - return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), - v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { - return vreinterpretq_s64_s32( - vmulq_s32(vreinterpretq_s32_s64(a), 
vreinterpretq_s32_s64(b))); -} - -SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { -#if defined(__aarch64__) - int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), - vreinterpret_s16_s64(vget_low_s64(b))); - int32x4_t t2 = - vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)); - return vreinterpretq_s64_s32(vpaddq_s32(t1, t2)); -#else - return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), - v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { -#if defined(__aarch64__) - int16x8_t t1 = vmulq_s16( - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))), - vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b)))); - int16x8_t t2 = vmulq_s16( - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))), - vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b)))); - return vreinterpretq_s64_s16( - vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2))); -#else - return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), - v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vmaxq_u8(vreinterpretq_u8_s64(x), 
vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE uint32_t v128_movemask_8(v128 a) { - a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0))); -#if defined(__aarch64__) - uint8x16_t m = - vandq_u8(vreinterpretq_u8_s64(a), - vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); - return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8); -#else - uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( - vandq_u8(vreinterpretq_u8_s64(a), - vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); - return v64_low_u32( - v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); -#endif -} - -SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { - c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0))); - return v128_or(v128_and(b, c), v128_andn(a, c)); -} - -SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u8( - vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return 
vreinterpretq_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u8( - vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u16( - vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); - return vreinterpretq_s64_s16(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u16( - vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); - return vreinterpretq_s64_s16(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { - uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u32( - vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); - return vreinterpretq_s64_s32(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u32( - vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); - return vreinterpretq_s64_s32(r.val[1]); 
-#endif -} - -SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { - uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)); - return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { - return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b)); -} - -SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { - return v128_from_v64(vget_high_s64((int64x2_t)a), - vget_high_s64((int64x2_t)b)); -} - -SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u8( - vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u8( - vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u16( - vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); - return vreinterpretq_s64_u16(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u16( - vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); - return vreinterpretq_s64_u16(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u32( - vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - uint32x4x2_t r = - vuzpq_u32(vreinterpretq_u32_s64(y), 
vreinterpretq_u32_s64(x)); - return vreinterpretq_s64_u32(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { -#if defined(__aarch64__) - return vreinterpretq_s64_u32( - vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - uint32x4x2_t r = - vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); - return vreinterpretq_s64_u32(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))), - vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))), - vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), - vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))), - 
vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b)))); -} - -SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { - return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a))); -} - -SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { - return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { - return vreinterpretq_s64_u32( - vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { - return vreinterpretq_s64_s32( - vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { - return vreinterpretq_s64_u32( - vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { - return vreinterpretq_s64_s32( - vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { -#if defined(__aarch64__) - return vreinterpretq_s64_u8( - vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern))); -#else - uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)), - vget_high_u8(vreinterpretq_u8_s64(x)) } }; - return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8( - p, vreinterpret_u8_s64(vget_high_s64(pattern)))), - (uint64_t)vreinterpret_s64_u8(vtbl2_u8( - p, vreinterpret_u8_s64(vget_low_s64(pattern))))); -#endif -} - -SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE 
v128 v128_cmplt_s16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() - : vreinterpretq_s64_u8( - vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c))); -} - -SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() - : vreinterpretq_s64_u8( - vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c))); -} - -SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { - return (c > 7) ? v128_ones() - : vreinterpretq_s64_s8( - vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c))); -} - -SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { - return (c > 15) ? v128_zero() - : vreinterpretq_s64_u16( - vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c))); -} - -SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { - return (c > 15) ? v128_zero() - : vreinterpretq_s64_u16( - vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c))); -} - -SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { - return (c > 15) ? v128_ones() - : vreinterpretq_s64_s16( - vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c))); -} - -SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { - return (c > 31) ? 
v128_zero() - : vreinterpretq_s64_u32( - vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c))); -} - -SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { - return (c > 31) ? v128_zero() - : vreinterpretq_s64_u32( - vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c))); -} - -SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { - return (c > 31) ? v128_ones() - : vreinterpretq_s64_s32( - vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c))); -} - -SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { - return (c > 63) ? v128_zero() - : vreinterpretq_s64_u64( - vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c))); -} - -SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { - return (c > 63) ? v128_zero() - : vreinterpretq_s64_u64( - vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c))); -} - -SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { - return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c)); -} - -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - -SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { - return n < 8 - ? v128_from_64( - (uint64_t)vorr_u64( - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - n * 8), - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - (8 - n) * 8)), - (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - n * 8)) - : (n == 8 ? v128_from_64( - (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0) - : v128_from_64((uint64_t)vshl_n_u64( - vreinterpret_u64_s64(vget_low_s64(a)), - (n - 8) * 8), - 0)); -} - -SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { - return n < 8 - ? v128_from_64( - (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - n * 8), - (uint64_t)vorr_u64( - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8), - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - (8 - n) * 8))) - : (n == 8 ? 
v128_from_64(0, (uint64_t)vreinterpret_u64_s64( - vget_high_s64(a))) - : v128_from_64( - 0, (uint64_t)vshr_n_u64( - vreinterpret_u64_s64(vget_high_s64(a)), - (n - 8) * 8))); -} - -SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { - return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { - return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { - return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)); -} - -SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { - return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { - return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { - return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c)); -} - -SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { - return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { - return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { - return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)); -} - -SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { - return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { - return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c)); -} - -SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { - return vshrq_n_s64(a, c); -} - -#else - -SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { - if (n < 8) - return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n), - v64_shr_n_byte(v128_low_v64(a), 8 - n)), - v64_shl_n_byte(v128_low_v64(a), 
n)); - else - return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero()); -} - -SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { - if (n < 8) - return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n), - v64_or(v64_shr_n_byte(v128_low_v64(a), n), - v64_shl_n_byte(v128_high_v64(a), 8 - n))); - else - return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8)); -} - -SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { - return v128_shl_8(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { - return v128_shr_u8(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { - return v128_shr_s8(a, c); -} - -SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { - return v128_shl_16(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { - return v128_shr_u16(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { - return v128_shr_s16(a, c); -} - -SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { - return v128_shl_32(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { - return v128_shr_u32(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { - return v128_shr_s32(a, c); -} - -SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { - return v128_shl_64(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { - return v128_shr_u64(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { - return v128_shr_s64(a, c); -} - -#endif - -typedef uint32x4_t sad128_internal_u16; - -SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); } - -/* Implementation dependent return value. Result must be finalised with - * v128_sad_u16_sum(). 
*/ -SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, - v128 b) { - return vaddq_u32( - s, vpaddlq_u16(vsubq_u16( - vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)), - vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b))))); -} - -SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { - uint64x2_t t = vpaddlq_u32(s); - return (uint32_t)(uint64_t)vget_high_u64(t) + - (uint32_t)(uint64_t)vget_low_u64(t); -} - -typedef v128 ssd128_internal_s16; -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_s16_sum(). */ -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, - v128 b) { - v128 d = v128_sub_16(a, b); - d = v128_madd_s16(d, d); - return v128_add_64( - s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d)))); -} - -SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { - return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); -} - -#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h deleted file mode 100644 index bbe9a9d28..000000000 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h +++ /dev/null @@ -1,888 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ -#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ - -#include <stdio.h> -#include <stdlib.h> - -#include "config/aom_config.h" - -#include "aom_dsp/simd/v64_intrinsics_c.h" - -typedef union { - uint8_t u8[16]; - uint16_t u16[8]; - uint32_t u32[4]; - uint64_t u64[2]; - int8_t s8[16]; - int16_t s16[8]; - int32_t s32[4]; - int64_t s64[2]; - c_v64 v64[2]; -} c_v128; - -SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; } - -SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; } - -SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; } - -SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) { - c_v128 t; - t.u64[1] = hi; - t.u64[0] = lo; - return t; -} - -SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) { - c_v128 t; - t.v64[1] = hi; - t.v64[0] = lo; - return t; -} - -SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c, - uint32_t d) { - c_v128 t; - t.u32[3] = a; - t.u32[2] = b; - t.u32[1] = c; - t.u32[0] = d; - return t; -} - -SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) { - c_v128 t; - uint8_t *pp = (uint8_t *)p; - uint8_t *q = (uint8_t *)&t; - int c; - for (c = 0; c < 16; c++) q[c] = pp[c]; - return t; -} - -SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) { - if (SIMD_CHECK && (uintptr_t)p & 15) { - fprintf(stderr, "Error: unaligned v128 load at %p\n", p); - abort(); - } - return c_v128_load_unaligned(p); -} - -SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) { - uint8_t *pp = (uint8_t *)p; - uint8_t *q = (uint8_t *)&a; - int c; - for (c = 0; c < 16; c++) pp[c] = q[c]; -} - -SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) { - if (SIMD_CHECK && (uintptr_t)p & 15) { - fprintf(stderr, "Error: unaligned v128 store at %p\n", p); - abort(); - } - c_v128_store_unaligned(p, a); -} - -SIMD_INLINE c_v128 c_v128_zero() { - c_v128 t; - t.u64[1] = t.u64[0] = 0; - return t; -} - -SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) { - c_v128 
t; - t.v64[1] = t.v64[0] = c_v64_dup_8(x); - return t; -} - -SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) { - c_v128 t; - t.v64[1] = t.v64[0] = c_v64_dup_16(x); - return t; -} - -SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) { - c_v128 t; - t.v64[1] = t.v64[0] = c_v64_dup_32(x); - return t; -} - -SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) { - c_v128 t; - t.u64[1] = t.u64[0] = x; - return t; -} - -SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) { - return c_v64_dotp_su8(a.v64[1], b.v64[1]) + - c_v64_dotp_su8(a.v64[0], b.v64[0]); -} - -SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { - return c_v64_dotp_s16(a.v64[1], b.v64[1]) + - c_v64_dotp_s16(a.v64[0], b.v64[0]); -} - -SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) { - // 32 bit products, 64 bit sum - return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) + - (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) + - (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) + - (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]); -} - -SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { - return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); -} - -typedef uint32_t c_sad128_internal; - -SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - v128_sad_u8_sum(). - The result for more than 32 v128_sad_u8() calls is undefined. */ -SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a, - c_v128 b) { - int c; - for (c = 0; c < 16; c++) - s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; - return s; -} - -SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; } - -typedef uint32_t c_ssd128_internal; - -SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_u8_sum(). 
*/ -SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a, - c_v128 b) { - int c; - for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); - return s; -} - -SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; } - -SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]), - c_v64_or(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]), - c_v64_xor(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]), - c_v64_and(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]), - c_v64_andn(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]), - c_v64_add_8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]), - c_v64_add_16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]), - c_v64_sadd_u8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]), - c_v64_sadd_s8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), - c_v64_sadd_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]), - c_v64_add_32(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) { - // Two complement overflow (silences sanitizers) - return c_v128_from_64( - a.v64[1].u64 > ~b.v64[1].u64 ? 
a.v64[1].u64 - ~b.v64[1].u64 - 1 - : a.v64[1].u64 + b.v64[1].u64, - a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1 - : a.v64[0].u64 + b.v64[0].u64); -} - -SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) { - c_v128 t; - t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; - t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; - t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; - t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; - return t; -} - -SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) { - c_v128 t; - t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1]; - t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3]; - t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5]; - t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7]; - t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9]; - t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11]; - t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13]; - t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15]; - return t; -} - -SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]), - c_v64_sub_8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]), - c_v64_ssub_u8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]), - c_v64_ssub_s8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]), - c_v64_sub_16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]), - c_v64_ssub_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]), - c_v64_ssub_u16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) { - return 
c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]), - c_v64_sub_32(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) { - // Two complement underflow (silences sanitizers) - return c_v128_from_64( - a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1 - : a.v64[1].u64 - b.v64[1].u64, - a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1 - : a.v64[0].u64 - b.v64[0].u64); -} - -SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { - return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) { - return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) { - c_v64 lo_bits = c_v64_mullo_s16(a, b); - c_v64 hi_bits = c_v64_mulhi_s16(a, b); - return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits), - c_v64_ziplo_16(hi_bits, lo_bits)); -} - -SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]), - c_v64_mullo_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]), - c_v64_mulhi_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]), - c_v64_mullo_s32(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]), - c_v64_madd_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]), - c_v64_madd_us8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]), - c_v64_avg_u8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { - return 
c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]), - c_v64_rdavg_u8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]), - c_v64_rdavg_u16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), - c_v64_avg_u16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]), - c_v64_min_u8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]), - c_v64_max_u8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]), - c_v64_min_s8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) { - return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | - ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | - ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | - ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | - ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | - ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | - ((a.s8[0] < 0) << 0); -} - -SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) { - c_v128 t; - for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? 
b.u8[i] : a.u8[i]; - return t; -} - -SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), - c_v64_max_s8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]), - c_v64_min_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]), - c_v64_max_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) { - c_v128 t; - int c; - for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c]; - return t; -} - -SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) { - c_v128 t; - int c; - for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c]; - return t; -} - -SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), - c_v64_ziplo_8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]), - c_v64_ziplo_8(a.v64[1], b.v64[1])); -} - -SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]), - c_v64_ziplo_16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]), - c_v64_ziplo_16(a.v64[1], b.v64[1])); -} - -SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]), - c_v64_ziplo_32(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]), - c_v64_ziplo_32(a.v64[1], b.v64[1])); -} - -SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) { - return c_v128_from_v64(a.v64[0], b.v64[0]); -} - -SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) 
{ - return c_v128_from_v64(a.v64[1], b.v64[1]); -} - -SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) { - return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b)); -} - -SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) { - return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b)); -} - -SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) { - return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b)); -} - -SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) { - c_v128 t; - if (mode) { - t.u8[15] = b.u8[15]; - t.u8[14] = b.u8[13]; - t.u8[13] = b.u8[11]; - t.u8[12] = b.u8[9]; - t.u8[11] = b.u8[7]; - t.u8[10] = b.u8[5]; - t.u8[9] = b.u8[3]; - t.u8[8] = b.u8[1]; - t.u8[7] = a.u8[15]; - t.u8[6] = a.u8[13]; - t.u8[5] = a.u8[11]; - t.u8[4] = a.u8[9]; - t.u8[3] = a.u8[7]; - t.u8[2] = a.u8[5]; - t.u8[1] = a.u8[3]; - t.u8[0] = a.u8[1]; - } else { - t.u8[15] = a.u8[14]; - t.u8[14] = a.u8[12]; - t.u8[13] = a.u8[10]; - t.u8[12] = a.u8[8]; - t.u8[11] = a.u8[6]; - t.u8[10] = a.u8[4]; - t.u8[9] = a.u8[2]; - t.u8[8] = a.u8[0]; - t.u8[7] = b.u8[14]; - t.u8[6] = b.u8[12]; - t.u8[5] = b.u8[10]; - t.u8[4] = b.u8[8]; - t.u8[3] = b.u8[6]; - t.u8[2] = b.u8[4]; - t.u8[1] = b.u8[2]; - t.u8[0] = b.u8[0]; - } - return t; -} - -SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) { - return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1) - : _c_v128_unzip_8(a, b, 0); -} - -SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) { - return CONFIG_BIG_ENDIAN ? 
_c_v128_unzip_8(b, a, 0) - : _c_v128_unzip_8(b, a, 1); -} - -SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) { - c_v128 t; - if (mode) { - t.u16[7] = b.u16[7]; - t.u16[6] = b.u16[5]; - t.u16[5] = b.u16[3]; - t.u16[4] = b.u16[1]; - t.u16[3] = a.u16[7]; - t.u16[2] = a.u16[5]; - t.u16[1] = a.u16[3]; - t.u16[0] = a.u16[1]; - } else { - t.u16[7] = a.u16[6]; - t.u16[6] = a.u16[4]; - t.u16[5] = a.u16[2]; - t.u16[4] = a.u16[0]; - t.u16[3] = b.u16[6]; - t.u16[2] = b.u16[4]; - t.u16[1] = b.u16[2]; - t.u16[0] = b.u16[0]; - } - return t; -} - -SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) { - return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1) - : _c_v128_unzip_16(a, b, 0); -} - -SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) { - return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0) - : _c_v128_unzip_16(b, a, 1); -} - -SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) { - c_v128 t; - if (mode) { - t.u32[3] = b.u32[3]; - t.u32[2] = b.u32[1]; - t.u32[1] = a.u32[3]; - t.u32[0] = a.u32[1]; - } else { - t.u32[3] = a.u32[2]; - t.u32[2] = a.u32[0]; - t.u32[1] = b.u32[2]; - t.u32[0] = b.u32[0]; - } - return t; -} - -SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) { - return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1) - : _c_v128_unzip_32(a, b, 0); -} - -SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) { - return CONFIG_BIG_ENDIAN ? 
_c_v128_unzip_32(b, a, 0) - : _c_v128_unzip_32(b, a, 1); -} - -SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) { - return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a)); -} - -SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) { - return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]), - c_v64_unpacklo_u8_s16(a.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) { - return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]), - c_v64_unpacklo_u8_s16(a.v64[1])); -} - -SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) { - return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a)); -} - -SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) { - return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]), - c_v64_unpacklo_s8_s16(a.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) { - return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]), - c_v64_unpacklo_s8_s16(a.v64[1])); -} - -SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]), - c_v64_pack_s32_s16(b.v64[1], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]), - c_v64_pack_s32_u16(b.v64[1], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), - c_v64_pack_s16_u8(b.v64[1], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]), - c_v64_pack_s16_s8(b.v64[1], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) { - return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a)); -} - -SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) { - return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a)); -} - -SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) { - return 
c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]), - c_v64_unpacklo_u16_s32(a.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) { - return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]), - c_v64_unpacklo_s16_s32(a.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) { - return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]), - c_v64_unpacklo_u16_s32(a.v64[1])); -} - -SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { - return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]), - c_v64_unpacklo_s16_s32(a.v64[1])); -} - -SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { - c_v128 t; - int c; - for (c = 0; c < 16; c++) - t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) - : pattern.u8[c] & 15]; - - return t; -} - -SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]), - c_v64_cmpgt_s8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]), - c_v64_cmplt_s8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]), - c_v64_cmpeq_8(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]), - c_v64_cmpgt_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]), - c_v64_cmplt_s16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { - return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]), - c_v64_cmpeq_16(a.v64[0], b.v64[0])); -} - -SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) { - c_v128 t; - int c; - for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]); - return t; -} - -SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) { - c_v128 t; - int 
c; - for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]); - return t; -} - -SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) { - c_v128 t; - int c; - for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]); - return t; -} - -SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) { - if (n < 8) - return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), - c_v64_shr_n_byte(a.v64[0], 8 - n)), - c_v64_shl_n_byte(a.v64[0], n)); - else - return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); -} - -SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) { - if (n < 8) - return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), - c_v64_or(c_v64_shr_n_byte(a.v64[0], n), - c_v64_shl_n_byte(a.v64[1], 8 - n))); - else - return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); -} - -SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) { - if (SIMD_CHECK && c > 15) { - fprintf(stderr, "Error: undefined alignment %d\n", c); - abort(); - } - return c ? 
c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c)) - : b; -} - -SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), - c_v64_shr_u16(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), - c_v64_shr_s16(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), - c_v64_shr_u32(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) { - return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), - c_v64_shr_s32(a.v64[0], c)); -} - -SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) { - a.v64[1].u64 <<= c; - a.v64[0].u64 <<= c; - return c_v128_from_v64(a.v64[1], a.v64[0]); -} - -SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) { - a.v64[1].u64 >>= c; - a.v64[0].u64 >>= c; - return c_v128_from_v64(a.v64[1], a.v64[0]); -} - -SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) { - a.v64[1].s64 >>= c; - a.v64[0].s64 >>= c; - return c_v128_from_v64(a.v64[1], a.v64[0]); -} - -SIMD_INLINE c_v128 
c_v128_shl_n_8(c_v128 a, const unsigned int n) { - return c_v128_shl_8(a, n); -} - -SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) { - return c_v128_shl_16(a, n); -} - -SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) { - return c_v128_shl_32(a, n); -} - -SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) { - return c_v128_shl_64(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) { - return c_v128_shr_u8(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) { - return c_v128_shr_u16(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) { - return c_v128_shr_u32(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) { - return c_v128_shr_u64(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) { - return c_v128_shr_s8(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) { - return c_v128_shr_s16(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) { - return c_v128_shr_s32(a, n); -} - -SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) { - return c_v128_shr_s64(a, n); -} - -typedef uint32_t c_sad128_internal_u16; - -SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - * v128_sad_u16_sum(). */ -SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s, - c_v128 a, c_v128 b) { - int c; - for (c = 0; c < 8; c++) - s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; - return s; -} - -SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; } - -typedef uint64_t c_ssd128_internal_s16; - -SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_s16_sum(). 
*/ -SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s, - c_v128 a, c_v128 b) { - int c; - for (c = 0; c < 8; c++) - s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * - (int32_t)(int16_t)(a.s16[c] - b.s16[c]); - return s; -} - -SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; } - -#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h deleted file mode 100644 index 6c7241ff4..000000000 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h +++ /dev/null @@ -1,656 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ -#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ - -#include <stdint.h> -#include "aom_dsp/simd/v64_intrinsics_x86.h" - -typedef __m128i v128; - -SIMD_INLINE uint32_t v128_low_u32(v128 a) { - return (uint32_t)_mm_cvtsi128_si32(a); -} - -SIMD_INLINE v64 v128_low_v64(v128 a) { - return _mm_unpacklo_epi64(a, v64_zero()); -} - -SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); } - -SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { - return _mm_unpacklo_epi64(b, a); -} - -SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { - return v128_from_v64(v64_from_64(a), v64_from_64(b)); -} - -SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return _mm_set_epi32(a, b, c, d); -} - -SIMD_INLINE v128 v128_load_aligned(const void *p) { - return _mm_load_si128((__m128i *)p); -} - -SIMD_INLINE v128 v128_load_unaligned(const void *p) { -#if defined(__SSSE3__) - return (__m128i)_mm_lddqu_si128((__m128i *)p); -#else - return _mm_loadu_si128((__m128i *)p); -#endif -} - -SIMD_INLINE void v128_store_aligned(void *p, v128 a) { - _mm_store_si128((__m128i *)p, a); -} - -SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { - _mm_storeu_si128((__m128i *)p, a); -} - -// The following function requires an immediate. -// Some compilers will check this during optimisation, others wont. -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) -#if defined(__SSSE3__) -SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) { - return c ? _mm_alignr_epi8(a, b, c) : b; -} -#else -#define v128_align(a, b, c) \ - ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) -#endif -#else -#if defined(__SSSE3__) -#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b)) -#else -#define v128_align(a, b, c) \ - ((c) ? 
_mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) -#endif -#endif - -SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); } - -SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); } - -SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); } - -SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); } - -SIMD_INLINE v128 v128_dup_64(uint64_t x) { - // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers - return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x); -} - -SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } - -SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } - -SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); } - -SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); } - -SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } - -SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } - -SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); } - -SIMD_INLINE v128 v128_padd_s16(v128 a) { - return _mm_madd_epi16(a, _mm_set1_epi16(1)); -} - -SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } - -SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } - -SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } - -SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } - -SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } - -SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } - -SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } - -SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); } - -SIMD_INLINE v128 v128_abs_s16(v128 a) { -#if defined(__SSSE3__) - return _mm_abs_epi16(a); -#else - return _mm_max_epi16(a, 
_mm_sub_epi16(_mm_setzero_si128(), a)); -#endif -} - -SIMD_INLINE v128 v128_abs_s8(v128 a) { -#if defined(__SSSE3__) - return _mm_abs_epi8(a); -#else - v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); - return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); -#endif -} - -SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { - return _mm_unpacklo_epi8(b, a); -} - -SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { - return _mm_unpackhi_epi8(b, a); -} - -SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { - return _mm_unpacklo_epi16(b, a); -} - -SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { - return _mm_unpackhi_epi16(b, a); -} - -SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { - return _mm_unpacklo_epi32(b, a); -} - -SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { - return _mm_unpackhi_epi32(b, a); -} - -SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { - return _mm_unpacklo_epi64(b, a); -} - -SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { - return _mm_unpackhi_epi64(b, a); -} - -SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } - -SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } - -SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } - -SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { - return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); -} - -SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { -#if defined(__SSSE3__) -#ifdef __x86_64__ - v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); -#else - v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); -#endif - return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), - _mm_shuffle_epi8(a, order)); -#else - return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); -#endif -} - -SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { - return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); -} - -SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { -#if defined(__SSSE3__) -#ifdef __x86_64__ - v128 order = 
_mm_cvtsi64_si128(0x0d0c090805040100LL); -#else - v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); -#endif - return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), - _mm_shuffle_epi8(a, order)); -#else - return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); -#endif -} - -SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { - return _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); -} - -SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { - return _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); -} - -SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { - return _mm_unpacklo_epi8(a, _mm_setzero_si128()); -} - -SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { - return _mm_unpacklo_epi8(a, _mm_setzero_si128()); -} - -SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { - return _mm_unpackhi_epi8(a, _mm_setzero_si128()); -} - -SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { - return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); -} - -SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { - return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); -} - -SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { - return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); -} - -SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { - return _mm_packs_epi32(b, a); -} - -SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { -#if defined(__SSE4_1__) - return _mm_packus_epi32(b, a); -#else - return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)), - v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b))); -#endif -} - -SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { - return _mm_packus_epi16(b, a); -} - -SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { - return _mm_packs_epi16(b, a); -} - -SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { - return _mm_unpacklo_epi16(a, _mm_setzero_si128()); -} - -SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { - return _mm_srai_epi32(_mm_unpacklo_epi16(a, 
a), 16); -} - -SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { - return _mm_unpacklo_epi16(a, _mm_setzero_si128()); -} - -SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { - return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); -} - -SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { - return _mm_unpackhi_epi16(a, _mm_setzero_si128()); -} - -SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { - return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); -} - -SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { -#if defined(__SSSE3__) - return _mm_shuffle_epi8(x, pattern); -#else - v128 output; - unsigned char *input = (unsigned char *)&x; - unsigned char *index = (unsigned char *)&pattern; - char *selected = (char *)&output; - int counter; - - for (counter = 0; counter < 16; counter++) { - selected[counter] = input[index[counter] & 15]; - } - - return output; -#endif -} - -SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { - v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b)); - v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b)); - v128 t = v128_add_32(t1, t2); - t = v128_add_32(t, _mm_srli_si128(t, 8)); - t = v128_add_32(t, _mm_srli_si128(t, 4)); - return (int32_t)v128_low_u32(t); -} - -SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { - v128 r = _mm_madd_epi16(a, b); -#if defined(__SSE4_1__) && defined(__x86_64__) - v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r), - _mm_cvtepi32_epi64(_mm_srli_si128(r, 8))); - return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8))); -#else - return (int64_t)_mm_cvtsi128_si32(r) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); -#endif -} - -SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { - v128 t = _mm_sad_epu8(a, _mm_setzero_si128()); - return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t)); -} - -typedef v128 sad128_internal; - -SIMD_INLINE sad128_internal 
v128_sad_u8_init() { return _mm_setzero_si128(); } - -/* Implementation dependent return value. Result must be finalised with - v128_sad_sum(). - The result for more than 32 v128_sad_u8() calls is undefined. */ -SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { - return _mm_add_epi64(s, _mm_sad_epu8(a, b)); -} - -SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { - return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); -} - -typedef int32_t ssd128_internal; - -SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_sum(). */ -SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { - v128 z = _mm_setzero_si128(); - v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z)); - v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z)); - v128 rl = _mm_madd_epi16(l, l); - v128 rh = _mm_madd_epi16(h, h); - v128 r = _mm_add_epi32(rl, rh); - r = _mm_add_epi32(r, _mm_srli_si128(r, 8)); - r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); - return s + _mm_cvtsi128_si32(r); -} - -SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; } - -SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); } - -SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); } - -SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); } - -SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); } - -SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { - v64 lo_bits = v64_mullo_s16(a, b); - v64 hi_bits = v64_mulhi_s16(a, b); - return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits), - v64_ziplo_16(hi_bits, lo_bits)); -} - -SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { - return _mm_mullo_epi16(a, b); -} - -SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { - return _mm_mulhi_epi16(a, b); -} - -SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { -#if 
defined(__SSE4_1__) - return _mm_mullo_epi32(a, b); -#else - return _mm_unpacklo_epi32( - _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8), - _mm_shuffle_epi32( - _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8)); -#endif -} - -SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { - v128 r = v128_mullo_s32(a, b); - return (int64_t)_mm_cvtsi128_si32(r) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); -} - -SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); } - -SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { -#if defined(__SSSE3__) - return _mm_maddubs_epi16(a, b); -#else - return _mm_packs_epi32( - _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), - _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), - _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), - _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); -#endif -} - -SIMD_INLINE v128 v128_padd_u8(v128 a) { - return v128_madd_us8(a, _mm_set1_epi8(1)); -} - -SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } - -SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { - return _mm_sub_epi8(_mm_avg_epu8(a, b), - _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); -} - -SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { - return _mm_sub_epi16(_mm_avg_epu16(a, b), - _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1))); -} - -SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } - -SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } - -SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } - -SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { -#if defined(__SSE4_1__) - return _mm_min_epi8(a, b); -#else - v128 mask = _mm_cmplt_epi8(a, b); - return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); -#endif -} - -SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); 
} - -SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { -#if defined(__SSE4_1__) - return _mm_blendv_epi8(a, b, c); -#else - c = _mm_cmplt_epi8(c, v128_zero()); - return v128_or(v128_and(b, c), v128_andn(a, c)); -#endif -} - -SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { -#if defined(__SSE4_1__) - return _mm_max_epi8(a, b); -#else - v128 mask = _mm_cmplt_epi8(b, a); - return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); -#endif -} - -SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } - -SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } - -SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { -#if defined(__SSE4_1__) - return _mm_min_epi32(a, b); -#else - v128 mask = _mm_cmplt_epi32(a, b); - return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); -#endif -} - -SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { -#if defined(__SSE4_1__) - return _mm_max_epi32(a, b); -#else - v128 mask = _mm_cmplt_epi32(b, a); - return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); -#endif -} - -SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } - -SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } - -SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } - -SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { - return _mm_cmpgt_epi16(a, b); -} - -SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { - return _mm_cmplt_epi16(a, b); -} - -SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); } - -SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { - return _mm_cmpgt_epi32(a, b); -} - -SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { - return _mm_cmplt_epi32(a, b); -} - -SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } - -SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { - return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), - _mm_sll_epi16(a, 
_mm_cvtsi32_si128(c))); -} - -SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { - return _mm_and_si128(_mm_set1_epi8(0xff >> c), - _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); -} - -SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { - __m128i x = _mm_cvtsi32_si128(c + 8); - return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), - _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); -} - -SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { - return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { - return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { - return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { - return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { - return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { - return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { - return _mm_sll_epi64(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { - return _mm_srl_epi64(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { - // _mm_sra_epi64 is missing in gcc? - return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c, - (int64_t)v64_u64(v128_low_v64(a)) >> c); - // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c)); -} - -/* These intrinsics require immediate values, so we must use #defines - to enforce that. 
*/ -#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127) -#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127) -#define v128_shl_n_8(a, c) \ - _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) -#define v128_shr_n_u8(a, c) \ - _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) -#define v128_shr_n_s8(a, c) \ - _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ - _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) -#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) -#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) -#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) -#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) -#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) -#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) -#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c) -#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c) -#define v128_shr_n_s64(a, c) \ - v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc? - -typedef v128 sad128_internal_u16; - -SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); } - -/* Implementation dependent return value. Result must be finalised with - * v128_sad_u16_sum(). 
*/ -SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, - v128 b) { -#if defined(__SSE4_1__) - v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b)); -#else - v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)), - v128_xor(b, v128_dup_16(32768))); - t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)), - v128_or(v128_and(a, t), v128_andn(b, t))); -#endif - return v128_add_32( - s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t))); -} - -SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { - return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) + - v128_low_u32(v128_shr_n_byte(s, 8)) + - v128_low_u32(v128_shr_n_byte(s, 12)); -} - -typedef v128 ssd128_internal_s16; - -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_s16_sum(). */ -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, - v128 b) { - v128 d = v128_sub_16(a, b); - d = v128_madd_s16(d, d); - return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()), - _mm_unpacklo_epi32(d, v128_zero()))); -} - -SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { - return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); -} - -#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h deleted file mode 100644 index cb99d35b7..000000000 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h +++ /dev/null @@ -1,376 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ -#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "aom_dsp/simd/v256_intrinsics_c.h" -#include "aom_dsp/simd/v128_intrinsics.h" -#include "aom_dsp/simd/v64_intrinsics.h" - -/* Fallback to plain, unoptimised C. */ - -typedef c_v256 v256; - -SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); } -SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); } -SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); } -SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); } -SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); } -SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { - return c_v256_from_v128(hi, lo); -} -SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { - return c_v256_from_64(a, b, c, d); -} -SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { - return c_v256_from_v64(a, b, c, d); -} - -SIMD_INLINE v256 v256_load_unaligned(const void *p) { - return c_v256_load_unaligned(p); -} -SIMD_INLINE v256 v256_load_aligned(const void *p) { - return c_v256_load_aligned(p); -} - -SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { - c_v256_store_unaligned(p, a); -} -SIMD_INLINE void v256_store_aligned(void *p, v256 a) { - c_v256_store_aligned(p, a); -} - -SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) { - return c_v256_align(a, b, c); -} - -SIMD_INLINE v256 v256_zero() { return c_v256_zero(); } -SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); } -SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); } -SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); } -SIMD_INLINE v256 v256_dup_64(uint64_t x) { return 
c_v256_dup_64(x); } - -typedef uint32_t sad256_internal; -SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); } -SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { - return c_v256_sad_u8(s, a, b); -} -SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { - return c_v256_sad_u8_sum(s); -} -typedef uint32_t ssd256_internal; -SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); } -SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { - return c_v256_ssd_u8(s, a, b); -} -SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { - return c_v256_ssd_u8_sum(s); -} - -SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { - return c_v256_dotp_su8(a, b); -} -SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { - return c_v256_dotp_s16(a, b); -} -SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { - return c_v256_dotp_s32(a, b); -} -SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } - -SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } -SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); } -SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); } -SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } - -SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } -SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } -SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); } -SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); } -SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } -SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } -SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); } -SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); } -SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); } 
-SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } -SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } -SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } -SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); } -SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); } -SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); } -SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); } -SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); } -SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); } -SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); } - -SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); } -SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { - return c_v256_mullo_s16(a, b); -} -SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { - return c_v256_mulhi_s16(a, b); -} -SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { - return c_v256_mullo_s32(a, b); -} -SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } -SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } - -SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); } -SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { - return c_v256_blend_8(a, b, c); -} - -SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } -SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } -SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { - return c_v256_rdavg_u16(a, b); -} -SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } -SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } -SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } -SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return 
c_v256_min_s8(a, b); } -SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } -SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } -SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } -SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); } -SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); } - -SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } -SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } -SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); } -SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); } -SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); } -SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); } -SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); } -SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); } -SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { - return c_v256_ziplo_128(a, b); -} -SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { - return c_v256_ziphi_128(a, b); -} -SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); } -SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); } -SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); } -SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { - return c_v256_unziplo_8(a, b); -} -SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { - return c_v256_unziphi_8(a, b); -} -SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { - return c_v256_unziplo_16(a, b); -} -SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { - return c_v256_unziphi_16(a, b); -} -SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { - return c_v256_unziplo_32(a, b); -} -SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { - return 
c_v256_unziphi_32(a, b); -} -SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { - return c_v256_unziplo_64(a, b); -} -SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { - return c_v256_unziphi_64(a, b); -} -SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } -SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { - return c_v256_unpacklo_u8_s16(a); -} -SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { - return c_v256_unpackhi_u8_s16(a); -} -SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); } -SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { - return c_v256_unpacklo_s8_s16(a); -} -SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { - return c_v256_unpackhi_s8_s16(a); -} -SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { - return c_v256_pack_s32_s16(a, b); -} -SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { - return c_v256_pack_s32_u16(a, b); -} -SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { - return c_v256_pack_s16_u8(a, b); -} -SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { - return c_v256_pack_s16_s8(a, b); -} -SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { - return c_v256_unpack_u16_s32(a); -} -SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { - return c_v256_unpack_s16_s32(a); -} -SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { - return c_v256_unpacklo_u16_s32(a); -} -SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { - return c_v256_unpacklo_s16_s32(a); -} -SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { - return c_v256_unpackhi_u16_s32(a); -} -SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { - return c_v256_unpackhi_s16_s32(a); -} -SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { - return c_v256_shuffle_8(a, pattern); -} -SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { - return c_v256_wideshuffle_8(a, b, pattern); -} -SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { - return c_v256_pshuffle_8(a, pattern); -} - -SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return 
c_v256_cmpgt_s8(a, b); } -SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); } -SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); } -SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { - return c_v256_cmpgt_s16(a, b); -} -SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { - return c_v256_cmplt_s16(a, b); -} -SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } -SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); } - -SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { - return c_v256_cmpgt_s32(a, b); -} -SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { - return c_v256_cmplt_s32(a, b); -} -SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { - return c_v256_shl_8(a, c); -} -SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { - return c_v256_shr_u8(a, c); -} -SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { - return c_v256_shr_s8(a, c); -} -SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { - return c_v256_shl_16(a, c); -} -SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { - return c_v256_shr_u16(a, c); -} -SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { - return c_v256_shr_s16(a, c); -} -SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { - return c_v256_shl_32(a, c); -} -SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { - return c_v256_shr_u32(a, c); -} -SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { - return c_v256_shr_s32(a, c); -} -SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { - return c_v256_shl_64(a, c); -} -SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { - return c_v256_shr_u64(a, c); -} -SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { - return c_v256_shr_s64(a, c); -} - -SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { - return c_v256_shr_n_byte(a, n); -} -SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) { - return c_v256_shl_n_byte(a, n); -} -SIMD_INLINE v256 
v256_shl_n_8(v256 a, unsigned int n) { - return c_v256_shl_n_8(a, n); -} -SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { - return c_v256_shl_n_16(a, n); -} -SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { - return c_v256_shl_n_32(a, n); -} -SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) { - return c_v256_shl_n_64(a, n); -} -SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { - return c_v256_shr_n_u8(a, n); -} -SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { - return c_v256_shr_n_u16(a, n); -} -SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { - return c_v256_shr_n_u32(a, n); -} -SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) { - return c_v256_shr_n_u64(a, n); -} -SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) { - return c_v256_shr_n_s8(a, n); -} -SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { - return c_v256_shr_n_s16(a, n); -} -SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) { - return c_v256_shr_n_s32(a, n); -} -SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) { - return c_v256_shr_n_s64(a, n); -} - -SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) { - return c_v256_shr_n_word(a, n); -} -SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) { - return c_v256_shl_n_word(a, n); -} - -typedef uint32_t sad256_internal_u16; -SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { - return c_v256_sad_u16_init(); -} -SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, - v256 b) { - return c_v256_sad_u16(s, a, b); -} -SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { - return c_v256_sad_u16_sum(s); -} - -typedef uint64_t ssd256_internal_s16; -SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { - return c_v256_ssd_s16_init(); -} -SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, - v256 b) { - return c_v256_ssd_s16(s, a, b); -} -SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { - return 
c_v256_ssd_s16_sum(s); -} - -#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h deleted file mode 100644 index bd86ea172..000000000 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ - -#include "aom_dsp/simd/v256_intrinsics_v128.h" - -#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h deleted file mode 100644 index a1c08e95a..000000000 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h +++ /dev/null @@ -1,953 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ -#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ - -#include <stdio.h> -#include <stdlib.h> - -#include "config/aom_config.h" - -#include "aom_dsp/simd/v128_intrinsics_c.h" - -typedef union { - uint8_t u8[32]; - uint16_t u16[16]; - uint32_t u32[8]; - uint64_t u64[4]; - int8_t s8[32]; - int16_t s16[16]; - int32_t s32[8]; - int64_t s64[4]; - c_v64 v64[4]; - c_v128 v128[2]; -} c_v256; - -SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; } - -SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; } - -SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; } - -SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; } - -SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; } - -SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) { - c_v256 t; - t.v128[1] = hi; - t.v128[0] = lo; - return t; -} - -SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c, - uint64_t d) { - c_v256 t; - t.u64[3] = a; - t.u64[2] = b; - t.u64[1] = c; - t.u64[0] = d; - return t; -} - -SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) { - c_v256 t; - t.u64[3] = a.u64; - t.u64[2] = b.u64; - t.u64[1] = c.u64; - t.u64[0] = d.u64; - return t; -} - -SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) { - c_v256 t; - uint8_t *pp = (uint8_t *)p; - uint8_t *q = (uint8_t *)&t; - int c; - for (c = 0; c < 32; c++) q[c] = pp[c]; - return t; -} - -SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) { - if (SIMD_CHECK && (uintptr_t)p & 31) { - fprintf(stderr, "Error: unaligned v256 load at %p\n", p); - abort(); - } - return c_v256_load_unaligned(p); -} - -SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) { - uint8_t *pp = (uint8_t *)p; - uint8_t *q = (uint8_t *)&a; - int c; - for (c = 0; c < 32; c++) pp[c] = q[c]; -} - -SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) { - if (SIMD_CHECK && (uintptr_t)p & 31) { - fprintf(stderr, "Error: 
unaligned v256 store at %p\n", p); - abort(); - } - c_v256_store_unaligned(p, a); -} - -SIMD_INLINE c_v256 c_v256_zero() { - c_v256 t; - t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; - return t; -} - -SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) { - c_v256 t; - t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x); - return t; -} - -SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) { - c_v256 t; - t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x); - return t; -} - -SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) { - c_v256 t; - t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x); - return t; -} - -SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) { - c_v256 t; - t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x; - return t; -} - -SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) { - return c_v128_dotp_su8(a.v128[1], b.v128[1]) + - c_v128_dotp_su8(a.v128[0], b.v128[0]); -} - -SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) { - return c_v128_dotp_s16(a.v128[1], b.v128[1]) + - c_v128_dotp_s16(a.v128[0], b.v128[0]); -} - -SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) { - return c_v128_dotp_s32(a.v128[1], b.v128[1]) + - c_v128_dotp_s32(a.v128[0], b.v128[0]); -} - -SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) { - return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]); -} - -typedef uint32_t c_sad256_internal; - -SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - v256_sad_u8_sum(). - The result for more than 16 v256_sad_u8() calls is undefined. */ -SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, - c_v256 b) { - int c; - for (c = 0; c < 32; c++) - s += a.u8[c] > b.u8[c] ? 
a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; - return s; -} - -SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; } - -typedef uint32_t c_ssd256_internal; - -SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - * v256_ssd_u8_sum(). */ -SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, - c_v256 b) { - int c; - for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); - return s; -} - -SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } - -SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]), - c_v128_or(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]), - c_v128_xor(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]), - c_v128_and(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]), - c_v128_andn(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]), - c_v128_add_8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]), - c_v128_add_16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]), - c_v128_sadd_s8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]), - c_v128_sadd_u8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], 
b.v128[1]), - c_v128_sadd_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]), - c_v128_add_32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]), - c_v128_add_64(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]), - c_v128_sub_64(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) { - c_v256 t; - for (int i = 0; i < 16; i++) - t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1]; - return t; -} - -SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { - c_v256 t; - t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; - t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; - t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; - t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; - t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; - t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; - t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13]; - t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15]; - return t; -} - -SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]), - c_v128_sub_8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]), - c_v128_ssub_u8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]), - c_v128_ssub_s8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]), - c_v128_sub_16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], 
b.v128[1]), - c_v128_ssub_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]), - c_v128_ssub_u16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]), - c_v128_sub_32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) { - return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) { - return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) { - c_v128 lo_bits = c_v128_mullo_s16(a, b); - c_v128 hi_bits = c_v128_mulhi_s16(a, b); - return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits), - c_v128_ziplo_16(hi_bits, lo_bits)); -} - -SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]), - c_v128_mullo_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]), - c_v128_mulhi_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]), - c_v128_mullo_s32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]), - c_v128_madd_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]), - c_v128_madd_us8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]), - c_v128_avg_u8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { - return 
c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]), - c_v128_rdavg_u8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]), - c_v128_rdavg_u16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), - c_v128_avg_u16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]), - c_v128_min_u8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]), - c_v128_max_u8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]), - c_v128_min_s8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { - return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) | - ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) | - ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) | - ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) | - ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) | - ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) | - ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) | - ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) | - ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | - ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | - ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | - ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | - ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | - ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | - ((a.s8[0] < 0) << 0); -} - -SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) { - c_v256 t; - for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? 
b.u8[i] : a.u8[i]; - return t; -} - -SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), - c_v128_max_s8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]), - c_v128_min_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]), - c_v128_max_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]), - c_v128_min_s32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]), - c_v128_max_s32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), - c_v128_ziplo_8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]), - c_v128_ziplo_8(a.v128[1], b.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]), - c_v128_ziplo_16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]), - c_v128_ziplo_16(a.v128[1], b.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]), - c_v128_ziplo_32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]), - c_v128_ziplo_32(a.v128[1], b.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], 
b.v128[0]), - c_v128_ziplo_64(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]), - c_v128_ziplo_64(a.v128[1], b.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) { - return c_v256_from_v128(a.v128[0], b.v128[0]); -} - -SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) { - return c_v256_from_v128(a.v128[1], b.v128[1]); -} - -SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) { - return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b)); -} - -SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) { - return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b)); -} - -SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) { - return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b)); -} - -SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) { - c_v256 t; - int i; - if (mode) { - for (i = 0; i < 16; i++) { - t.u8[i] = a.u8[i * 2 + 1]; - t.u8[i + 16] = b.u8[i * 2 + 1]; - } - } else { - for (i = 0; i < 16; i++) { - t.u8[i] = b.u8[i * 2]; - t.u8[i + 16] = a.u8[i * 2]; - } - } - return t; -} - -SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1) - : _c_v256_unzip_8(a, b, 0); -} - -SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0) - : _c_v256_unzip_8(b, a, 1); -} - -SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) { - c_v256 t; - int i; - if (mode) { - for (i = 0; i < 8; i++) { - t.u16[i] = a.u16[i * 2 + 1]; - t.u16[i + 8] = b.u16[i * 2 + 1]; - } - } else { - for (i = 0; i < 8; i++) { - t.u16[i] = b.u16[i * 2]; - t.u16[i + 8] = a.u16[i * 2]; - } - } - return t; -} - -SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? 
_c_v256_unzip_16(a, b, 1) - : _c_v256_unzip_16(a, b, 0); -} - -SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0) - : _c_v256_unzip_16(b, a, 1); -} - -SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { - c_v256 t; - if (mode) { - t.u32[7] = b.u32[7]; - t.u32[6] = b.u32[5]; - t.u32[5] = b.u32[3]; - t.u32[4] = b.u32[1]; - t.u32[3] = a.u32[7]; - t.u32[2] = a.u32[5]; - t.u32[1] = a.u32[3]; - t.u32[0] = a.u32[1]; - } else { - t.u32[7] = a.u32[6]; - t.u32[6] = a.u32[4]; - t.u32[5] = a.u32[2]; - t.u32[4] = a.u32[0]; - t.u32[3] = b.u32[6]; - t.u32[2] = b.u32[4]; - t.u32[1] = b.u32[2]; - t.u32[0] = b.u32[0]; - } - return t; -} - -SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1) - : _c_v256_unzip_32(a, b, 0); -} - -SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) - : _c_v256_unzip_32(b, a, 1); -} - -SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) { - c_v256 t; - if (mode) { - t.u64[3] = b.u64[3]; - t.u64[2] = b.u64[1]; - t.u64[1] = a.u64[3]; - t.u64[0] = a.u64[1]; - } else { - t.u64[3] = a.u64[2]; - t.u64[2] = a.u64[0]; - t.u64[1] = b.u64[2]; - t.u64[0] = b.u64[0]; - } - return t; -} - -SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1) - : _c_v256_unzip_64(a, b, 0); -} - -SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) { - return CONFIG_BIG_ENDIAN ? 
_c_v256_unzip_64(b, a, 0) - : _c_v256_unzip_64(b, a, 1); -} - -SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) { - return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a)); -} - -SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]), - c_v128_unpacklo_u8_s16(a.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]), - c_v128_unpacklo_u8_s16(a.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) { - return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a)); -} - -SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]), - c_v128_unpacklo_s8_s16(a.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]), - c_v128_unpacklo_s8_s16(a.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]), - c_v128_pack_s32_s16(b.v128[1], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]), - c_v128_pack_s32_u16(b.v128[1], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]), - c_v128_pack_s16_u8(b.v128[1], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]), - c_v128_pack_s16_s8(b.v128[1], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) { - return c_v256_from_v128(c_v128_unpackhi_u16_s32(a), - c_v128_unpacklo_u16_s32(a)); -} - -SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) { - return c_v256_from_v128(c_v128_unpackhi_s16_s32(a), - c_v128_unpacklo_s16_s32(a)); -} - -SIMD_INLINE 
c_v256 c_v256_unpacklo_u16_s32(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]), - c_v128_unpacklo_u16_s32(a.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]), - c_v128_unpacklo_s16_s32(a.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]), - c_v128_unpacklo_u16_s32(a.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) { - return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]), - c_v128_unpacklo_s16_s32(a.v128[1])); -} - -SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) { - c_v256 t; - int c; - for (c = 0; c < 32; c++) - t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) - : pattern.u8[c] & 31]; - - return t; -} - -SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) { - c_v256 t; - int c; - for (c = 0; c < 32; c++) - t.u8[c] = (pattern.u8[c] < 32 - ? b.u8 - : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) - : pattern.u8[c] & 31]; - return t; -} - -// Pairwise / dual-lane shuffle: shuffle two 128 bit lates. 
-SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) { - return c_v256_from_v128( - c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)), - c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern))); -} - -SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]), - c_v128_cmpgt_s8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]), - c_v128_cmplt_s8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]), - c_v128_cmpeq_8(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]), - c_v128_cmpgt_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]), - c_v128_cmplt_s16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]), - c_v128_cmpeq_16(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]), - c_v128_cmpgt_s32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]), - c_v128_cmplt_s32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) { - return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]), - c_v128_cmpeq_32(a.v128[0], b.v128[0])); -} - -SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { - if (n < 16) - return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), - c_v128_shr_n_byte(a.v128[0], 16 - n)), - c_v128_shl_n_byte(a.v128[0], n)); - 
else if (n > 16) - return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), - c_v128_zero()); - else - return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); -} - -SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { - if (n < 16) - return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), - c_v128_or(c_v128_shr_n_byte(a.v128[0], n), - c_v128_shl_n_byte(a.v128[1], 16 - n))); - else if (n > 16) - return c_v256_from_v128(c_v128_zero(), - c_v128_shr_n_byte(a.v128[1], n - 16)); - else - return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); -} - -SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { - if (SIMD_CHECK && c > 31) { - fprintf(stderr, "Error: undefined alignment %d\n", c); - abort(); - } - return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c)) - : b; -} - -SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shl_8(a.v128[1], c), - c_v128_shl_8(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c), - c_v128_shr_u8(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c), - c_v128_shr_s8(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shl_16(a.v128[1], c), - c_v128_shl_16(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c), - c_v128_shr_u16(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c), - c_v128_shr_s16(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shl_32(a.v128[1], c), - c_v128_shl_32(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) { - return 
c_v256_from_v128(c_v128_shr_u32(a.v128[1], c), - c_v128_shr_u32(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) { - return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c), - c_v128_shr_s32(a.v128[0], c)); -} - -SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) { - c_v256 t; - if (SIMD_CHECK && n > 63) { - fprintf(stderr, "Error: undefined s64 shift right %d\n", n); - abort(); - } - t.s64[3] = a.s64[3] >> n; - t.s64[2] = a.s64[2] >> n; - t.s64[1] = a.s64[1] >> n; - t.s64[0] = a.s64[0] >> n; - return t; -} - -SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) { - c_v256 t; - if (SIMD_CHECK && n > 63) { - fprintf(stderr, "Error: undefined s64 shift right %d\n", n); - abort(); - } - t.u64[3] = a.u64[3] >> n; - t.u64[2] = a.u64[2] >> n; - t.u64[1] = a.u64[1] >> n; - t.u64[0] = a.u64[0] >> n; - return t; -} - -SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) { - c_v256 t; - if (SIMD_CHECK && n > 63) { - fprintf(stderr, "Error: undefined s64 shift right %d\n", n); - abort(); - } - t.u64[3] = a.u64[3] << n; - t.u64[2] = a.u64[2] << n; - t.u64[1] = a.u64[1] << n; - t.u64[0] = a.u64[0] << n; - return t; -} - -SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) { - return c_v256_shl_8(a, n); -} - -SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) { - return c_v256_shl_16(a, n); -} - -SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) { - return c_v256_shl_32(a, n); -} - -SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) { - return c_v256_shl_64(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) { - return c_v256_shr_u8(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) { - return c_v256_shr_u16(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) { - return c_v256_shr_u32(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) { - return c_v256_shr_u64(a, n); -} - -SIMD_INLINE c_v256 
c_v256_shr_n_s8(c_v256 a, unsigned int n) { - return c_v256_shr_s8(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) { - return c_v256_shr_s16(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) { - return c_v256_shr_s32(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) { - return c_v256_shr_s64(a, n); -} - -SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) { - return c_v256_shr_n_byte(a, 2 * n); -} -SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) { - return c_v256_shl_n_byte(a, 2 * n); -} - -typedef uint32_t c_sad256_internal_u16; - -SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - v256_sad_u16_sum(). */ -SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s, - c_v256 a, c_v256 b) { - int c; - for (c = 0; c < 16; c++) - s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; - return s; -} - -SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; } - -typedef uint64_t c_ssd256_internal_s16; - -SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; } - -/* Implementation dependent return value. Result must be finalised with - * v256_ssd_s16_sum(). 
*/ -SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, - c_v256 a, c_v256 b) { - int c; - for (c = 0; c < 16; c++) - s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * - (int32_t)(int16_t)(a.s16[c] - b.s16[c]); - return s; -} - -SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } - -#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h deleted file mode 100644 index d5b7905ef..000000000 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h +++ /dev/null @@ -1,873 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ -#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ - -#if HAVE_NEON -#include "aom_dsp/simd/v128_intrinsics_arm.h" -#elif HAVE_SSE2 -#include "aom_dsp/simd/v128_intrinsics_x86.h" -#else -#include "aom_dsp/simd/v128_intrinsics.h" -#endif - -#if HAVE_NEON -typedef int64x2x2_t v256; -#else -typedef struct { - v128 val[2]; -} v256; -#endif - -SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } - -SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); } - -SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } - -SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; } - -SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; } - -SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { - v256 t; - t.val[1] = hi; - t.val[0] = lo; - return t; -} - -SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { - return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); -} - -SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { - return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); -} - -SIMD_INLINE v256 v256_load_unaligned(const void *p) { - return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), - v128_load_unaligned(p)); -} - -SIMD_INLINE v256 v256_load_aligned(const void *p) { - return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), - v128_load_aligned(p)); -} - -SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { - v128_store_unaligned(p, a.val[0]); - v128_store_unaligned((uint8_t *)p + 16, a.val[1]); -} - -SIMD_INLINE void v256_store_aligned(void *p, v256 a) { - v128_store_aligned(p, a.val[0]); - v128_store_aligned((uint8_t *)p + 16, a.val[1]); -} - -SIMD_INLINE v256 v256_zero() { - return v256_from_v128(v128_zero(), v128_zero()); -} - -SIMD_INLINE v256 v256_dup_8(uint8_t x) { - v128 t = v128_dup_8(x); - return v256_from_v128(t, t); -} - -SIMD_INLINE v256 v256_dup_16(uint16_t x) { - v128 t 
= v128_dup_16(x); - return v256_from_v128(t, t); -} - -SIMD_INLINE v256 v256_dup_32(uint32_t x) { - v128 t = v128_dup_32(x); - return v256_from_v128(t, t); -} - -SIMD_INLINE v256 v256_dup_64(uint64_t x) { - v128 t = v128_dup_64(x); - return v256_from_v128(t, t); -} - -SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { - return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]); -} - -SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { - return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]); -} - -SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { - return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]); -} - -SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { - return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]); -} - -typedef struct { - sad128_internal val[2]; -} sad256_internal; - -SIMD_INLINE sad256_internal v256_sad_u8_init() { - sad256_internal t; - t.val[1] = v128_sad_u8_init(); - t.val[0] = v128_sad_u8_init(); - return t; -} - -/* Implementation dependent return value. Result must be finalised with - v256_sad_u8_sum(). - The result for more than 16 v256_sad_u8() calls is undefined. */ -SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { - sad256_internal t; - t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]); - t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]); - return t; -} - -SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { - return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]); -} - -typedef struct { - ssd128_internal val[2]; -} ssd256_internal; - -SIMD_INLINE ssd256_internal v256_ssd_u8_init() { - ssd256_internal t; - t.val[1] = v128_ssd_u8_init(); - t.val[0] = v128_ssd_u8_init(); - return t; -} - -/* Implementation dependent return value. Result must be finalised with - * v256_ssd_u8_sum(). 
*/ -SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { - ssd256_internal t; - t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]); - t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]); - return t; -} - -SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { - return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]); -} - -SIMD_INLINE v256 v256_or(v256 a, v256 b) { - return v256_from_v128(v128_or(a.val[1], b.val[1]), - v128_or(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_xor(v256 a, v256 b) { - return v256_from_v128(v128_xor(a.val[1], b.val[1]), - v128_xor(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_and(v256 a, v256 b) { - return v256_from_v128(v128_and(a.val[1], b.val[1]), - v128_and(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_andn(v256 a, v256 b) { - return v256_from_v128(v128_andn(a.val[1], b.val[1]), - v128_andn(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { - return v256_from_v128(v128_add_8(a.val[1], b.val[1]), - v128_add_8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { - return v256_from_v128(v128_add_16(a.val[1], b.val[1]), - v128_add_16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { - return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]), - v128_sadd_s8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { - return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]), - v128_sadd_u8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { - return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]), - v128_sadd_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { - return v256_from_v128(v128_add_32(a.val[1], b.val[1]), - v128_add_32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { - return v256_from_v128(v128_add_64(a.val[1], b.val[1]), - v128_add_64(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_padd_u8(v256 a) { - return 
v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0])); -} - -SIMD_INLINE v256 v256_padd_s16(v256 a) { - return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0])); -} - -SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { - return v256_from_v128(v128_sub_8(a.val[1], b.val[1]), - v128_sub_8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { - return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]), - v128_ssub_u8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { - return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]), - v128_ssub_s8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { - return v256_from_v128(v128_sub_16(a.val[1], b.val[1]), - v128_sub_16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { - return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]), - v128_ssub_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { - return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]), - v128_ssub_u16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { - return v256_from_v128(v128_sub_32(a.val[1], b.val[1]), - v128_sub_32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { - return v256_from_v128(v128_sub_64(a.val[1], b.val[1]), - v128_sub_64(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_abs_s16(v256 a) { - return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0])); -} - -SIMD_INLINE v256 v256_abs_s8(v256 a) { - return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0])); -} - -SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { - v128 lo_bits = v128_mullo_s16(a, b); - v128 hi_bits = v128_mulhi_s16(a, b); - return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), - v128_ziplo_16(hi_bits, lo_bits)); -} - -SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { - return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]), - v128_mullo_s16(a.val[0], b.val[0])); -} - 
-SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { - return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]), - v128_mulhi_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { - return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]), - v128_mullo_s32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { - return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]), - v128_madd_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { - return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]), - v128_madd_us8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { - return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]), - v128_avg_u8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { - return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]), - v128_rdavg_u8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { - return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]), - v128_rdavg_u16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { - return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]), - v128_avg_u16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { - return v256_from_v128(v128_min_u8(a.val[1], b.val[1]), - v128_min_u8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { - return v256_from_v128(v128_max_u8(a.val[1], b.val[1]), - v128_max_u8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { - return v256_from_v128(v128_min_s8(a.val[1], b.val[1]), - v128_min_s8(a.val[0], b.val[0])); -} - -SIMD_INLINE uint32_t v256_movemask_8(v256 a) { - return (v128_movemask_8(v256_high_v128(a)) << 16) | - v128_movemask_8(v256_low_v128(a)); -} - -SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { - return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]), - v128_blend_8(a.val[0], b.val[0], c.val[0])); -} - -SIMD_INLINE v256 
v256_max_s8(v256 a, v256 b) { - return v256_from_v128(v128_max_s8(a.val[1], b.val[1]), - v128_max_s8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { - return v256_from_v128(v128_min_s16(a.val[1], b.val[1]), - v128_min_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { - return v256_from_v128(v128_max_s16(a.val[1], b.val[1]), - v128_max_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { - return v256_from_v128(v128_min_s32(a.val[1], b.val[1]), - v128_min_s32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { - return v256_from_v128(v128_max_s32(a.val[1], b.val[1]), - v128_max_s32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]), - v128_ziplo_8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]), - v128_ziplo_8(a.val[1], b.val[1])); -} - -SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]), - v128_ziplo_16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]), - v128_ziplo_16(a.val[1], b.val[1])); -} - -SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]), - v128_ziplo_32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]), - v128_ziplo_32(a.val[1], b.val[1])); -} - -SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]), - v128_ziplo_64(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]), - v128_ziplo_64(a.val[1], b.val[1])); -} - -SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { - return 
v256_from_v128(a.val[0], b.val[0]); -} - -SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { - return v256_from_v128(a.val[1], b.val[1]); -} - -SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { - return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); -} - -SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { - return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); -} - -SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { - return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); -} - -SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]), - v128_unziplo_8(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]), - v128_unziphi_8(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]), - v128_unziplo_16(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]), - v128_unziphi_16(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]), - v128_unziplo_32(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]), - v128_unziphi_32(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { -#if HAVE_SSE2 - return v256_from_v128( - _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), - _mm_castsi128_pd(a.val[1]), 0)), - _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), - _mm_castsi128_pd(b.val[1]), 0))); -#else - return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]), - v128_low_v64(b.val[1]), v128_low_v64(b.val[0])); -#endif -} - -SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { -#if HAVE_SSE2 - return v256_from_v128( - 
_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), - _mm_castsi128_pd(a.val[1]), 3)), - _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), - _mm_castsi128_pd(b.val[1]), 3))); -#else - return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]), - v128_high_v64(b.val[1]), v128_high_v64(b.val[0])); -#endif -} - -SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { - return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); -} - -SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]), - v128_unpacklo_u8_s16(a.val[0])); -} - -SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]), - v128_unpacklo_u8_s16(a.val[1])); -} - -SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { - return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); -} - -SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]), - v128_unpacklo_s8_s16(a.val[0])); -} - -SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]), - v128_unpacklo_s8_s16(a.val[1])); -} - -SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { - return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]), - v128_pack_s32_s16(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { - return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]), - v128_pack_s32_u16(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { - return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]), - v128_pack_s16_u8(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { - return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]), - v128_pack_s16_s8(b.val[1], b.val[0])); -} - -SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { - return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); -} - -SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) 
{ - return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); -} - -SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]), - v128_unpacklo_u16_s32(a.val[0])); -} - -SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]), - v128_unpacklo_s16_s32(a.val[0])); -} - -SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]), - v128_unpacklo_u16_s32(a.val[1])); -} - -SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]), - v128_unpacklo_s16_s32(a.val[1])); -} - -SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { - return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]), - v128_cmpgt_s8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { - return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]), - v128_cmplt_s8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { - return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]), - v128_cmpeq_8(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { - return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]), - v128_cmpgt_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { - return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]), - v128_cmplt_s16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { - return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]), - v128_cmpeq_16(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { - return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]), - v128_cmpgt_s32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { - return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]), - v128_cmplt_s32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { - return v256_from_v128(v128_cmpeq_32(a.val[1], 
b.val[1]), - v128_cmpeq_32(a.val[0], b.val[0])); -} - -SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { -#if HAVE_NEON -#if defined(__aarch64__) - uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]), - vreinterpretq_u8_s64(x.val[1]) } }; - return v256_from_v128( - vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), - vreinterpretq_s64_u8( - vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); -#else - uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), - vget_high_u8(vreinterpretq_u8_s64(x.val[0])), - vget_low_u8(vreinterpretq_u8_s64(x.val[1])), - vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; - return v256_from_64( - (uint64_t)vreinterpret_s64_u8( - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), - (uint64_t)vreinterpret_s64_u8( - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), - (uint64_t)vreinterpret_s64_u8( - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), - (uint64_t)vreinterpret_s64_u8( - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); -#endif -#else - v128 c16 = v128_dup_8(16); - v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); - v128 masklo = v128_cmplt_s8(pattern.val[0], c16); - return v256_from_v128( - v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)), - v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), - v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), - v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); -#endif -} - -SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { -#if HAVE_NEON -#if defined(__aarch64__) - uint8x16x4_t p = { { - vreinterpretq_u8_s64(y.val[0]), - vreinterpretq_u8_s64(y.val[1]), - vreinterpretq_u8_s64(x.val[0]), - vreinterpretq_u8_s64(x.val[1]), - } }; - return v256_from_v128( - vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), - vreinterpretq_s64_u8( - vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); -#else - v256 c32 = 
v256_dup_8(32); - v256 p32 = v256_sub_8(pattern, c32); - uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), - vget_high_u8(vreinterpretq_u8_s64(x.val[0])), - vget_low_u8(vreinterpretq_u8_s64(x.val[1])), - vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; - uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])), - vget_high_u8(vreinterpretq_u8_s64(y.val[0])), - vget_low_u8(vreinterpretq_u8_s64(y.val[1])), - vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } }; - v256 r1 = - v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( - p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))), - (uint64_t)vreinterpret_s64_u8(vtbl4_u8( - p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))), - (uint64_t)vreinterpret_s64_u8(vtbl4_u8( - p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))), - (uint64_t)vreinterpret_s64_u8(vtbl4_u8( - p, vreinterpret_u8_s64(vget_low_s64(p32.val[0]))))); - v256 r2 = - v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( - q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), - (uint64_t)vreinterpret_s64_u8(vtbl4_u8( - q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), - (uint64_t)vreinterpret_s64_u8(vtbl4_u8( - q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), - (uint64_t)vreinterpret_s64_u8(vtbl4_u8( - q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); - return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32)); -#endif -#else - v128 c16 = v128_dup_8(16); - v128 c32 = v128_dup_8(32); - v128 c48 = v128_dup_8(48); - v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); - v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]); - v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); - v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]); - v256 r1 = v256_from_v128( - v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)), - v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)), - maskhi48), - v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)), - v128_shuffle_8(x.val[0], 
v128_sub_8(pattern.val[0], c32)), - masklo48)); - v256 r2 = v256_from_v128( - v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)), - v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16), - v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), - v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); - return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); -#endif -} - -SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { - return v256_from_v128( - v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), - v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); -} - -SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) { - return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) { - return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) { - return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned 
int c) { - return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c)); -} - -SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) { - return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c)); -} - -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -#define v256_shl_n_byte(a, n) \ - ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \ - v128_shr_n_byte(a.val[0], 16 - (n))), \ - v128_shl_n_byte(a.val[0], (n))) \ - : v256_from_v128( \ - (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \ - v128_zero())) - -#define v256_shr_n_byte(a, n) \ - ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \ - v128_or(v128_shr_n_byte(a.val[0], n), \ - v128_shl_n_byte(a.val[1], 16 - (n)))) \ - : v256_from_v128( \ - v128_zero(), \ - (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])) - -#define v256_align(a, b, c) \ - ((c) ? 
v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) - -#define v256_shl_n_8(a, n) \ - v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n)) -#define v256_shl_n_16(a, n) \ - v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n)) -#define v256_shl_n_32(a, n) \ - v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n)) -#define v256_shl_n_64(a, n) \ - v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n)) -#define v256_shr_n_u8(a, n) \ - v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n)) -#define v256_shr_n_u16(a, n) \ - v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n)) -#define v256_shr_n_u32(a, n) \ - v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n)) -#define v256_shr_n_u64(a, n) \ - v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n)) -#define v256_shr_n_s8(a, n) \ - v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n)) -#define v256_shr_n_s16(a, n) \ - v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n)) -#define v256_shr_n_s32(a, n) \ - v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n)) -#define v256_shr_n_s64(a, n) \ - v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n)) - -#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) -#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) - -typedef struct { - sad128_internal_u16 val[2]; -} sad256_internal_u16; - -SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { - sad256_internal_u16 t; - t.val[1] = v128_sad_u16_init(); - t.val[0] = v128_sad_u16_init(); - return t; -} - -/* Implementation dependent return value. Result must be finalised with - v256_sad_u16_sum(). - The result for more than 16 v256_sad_u16() calls is undefined. 
*/ -SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, - v256 b) { - sad256_internal_u16 t; - t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]); - t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]); - return t; -} - -SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { - return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); -} - -typedef struct { - ssd128_internal_s16 val[2]; -} ssd256_internal_s16; - -SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { - ssd256_internal_s16 t; - t.val[1] = v128_ssd_s16_init(); - t.val[0] = v128_ssd_s16_init(); - return t; -} - -/* Implementation dependent return value. Result must be finalised with - * v256_ssd_s16_sum(). */ -SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, - v256 b) { - ssd256_internal_s16 t; - t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]); - t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]); - return t; -} - -SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { - return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); -} - -#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h deleted file mode 100644 index 44594bc41..000000000 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h +++ /dev/null @@ -1,750 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ -#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ - -#if !defined(__AVX2__) - -#include "aom_dsp/simd/v256_intrinsics_v128.h" - -#else - -// The _m256i type seems to cause problems for g++'s mangling prior to -// version 5, but adding -fabi-version=0 fixes this. -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \ - defined(__AVX2__) && defined(__cplusplus) -#pragma GCC optimize "-fabi-version=0" -#endif - -#include <immintrin.h> - -#include "aom_dsp/simd/v128_intrinsics_x86.h" - -typedef __m256i v256; - -SIMD_INLINE uint32_t v256_low_u32(v256 a) { - return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0)); -} - -SIMD_INLINE v64 v256_low_v64(v256 a) { - return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero()); -} - -SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } - -SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); } - -SIMD_INLINE v128 v256_high_v128(v256 a) { - return _mm256_extracti128_si256(a, 1); -} - -SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) { - // gcc seems to be missing _mm256_set_m128i() - return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1); -} - -SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { - return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); -} - -SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { - return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); -} - -SIMD_INLINE v256 v256_load_aligned(const void *p) { - return _mm256_load_si256((const __m256i *)p); -} - -SIMD_INLINE v256 v256_load_unaligned(const void *p) { - return _mm256_loadu_si256((const __m256i *)p); -} - -SIMD_INLINE void v256_store_aligned(void *p, v256 a) { - _mm256_store_si256((__m256i *)p, a); -} - -SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { - _mm256_storeu_si256((__m256i *)p, a); -} - -SIMD_INLINE v256 v256_zero() { return 
_mm256_setzero_si256(); } - -SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); } - -SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); } - -SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); } - -SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); } - -SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); } - -SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } - -SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); } - -SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); } - -SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { - return _mm256_adds_epi16(a, b); -} - -SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } - -SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); } - -SIMD_INLINE v256 v256_padd_u8(v256 a) { - return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1)); -} - -SIMD_INLINE v256 v256_padd_s16(v256 a) { - return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); -} - -SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); } - -SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); } - -SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); } - -SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); } - -SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { - return _mm256_subs_epi16(a, b); -} - -SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { - return _mm256_subs_epu16(a, b); -} - -SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } - -SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); } - -SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); } - -SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } - -// AVX doesn't have the direct intrinsics to zip/unzip 
8, 16, 32 bit -// lanes of lower or upper halves of a 256bit vector because the -// unpack/pack intrinsics operate on the 256 bit input vector as 2 -// independent 128 bit vectors. -SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { - return _mm256_unpacklo_epi8( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { - return _mm256_unpackhi_epi8( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { - return _mm256_unpacklo_epi16( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { - return _mm256_unpackhi_epi16( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { - return _mm256_unpacklo_epi32( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { - return _mm256_unpackhi_epi32( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { - return _mm256_unpacklo_epi64( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { - return _mm256_unpackhi_epi64( - _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); -} - -SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { - return v256_from_v128(v256_low_v128(a), v256_low_v128(b)); -} - -SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { - return v256_from_v128(v256_high_v128(a), 
v256_high_v128(b)); -} - -SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { - return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); -} - -SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { - return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); -} - -SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { - return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); -} - -SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { - return _mm256_permute4x64_epi64( - _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { - return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1)); -} - -SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { - return _mm256_permute4x64_epi64( - _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { - return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2)); -} - -SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { - return _mm256_permute4x64_epi64( - _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), - _mm256_castsi256_ps(a), - _MM_SHUFFLE(3, 1, 3, 1))), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { - return _mm256_permute4x64_epi64( - _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), - _mm256_castsi256_ps(a), - _MM_SHUFFLE(2, 0, 2, 0))), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { - return _mm256_permute4x64_epi64( - _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b), - _mm256_castsi256_pd(a), 15)), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { - return _mm256_permute4x64_epi64( - _mm256_castpd_si256( - _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { - return 
v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); -} - -SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { - return _mm256_unpacklo_epi8( - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_setzero_si256()); -} - -SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { - return _mm256_unpackhi_epi8( - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_setzero_si256()); -} - -SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { - return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); -} - -SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { - return _mm256_srai_epi16( - _mm256_unpacklo_epi8( - a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), - 8); -} - -SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { - return _mm256_srai_epi16( - _mm256_unpackhi_epi8( - a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), - 8); -} - -SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { - return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { - return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { - return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { - return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a), - _MM_SHUFFLE(3, 1, 2, 0)); -} - -SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { - return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); -} - -SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { - return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); -} - -SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { - return _mm256_unpacklo_epi16( - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_setzero_si256()); -} - -SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { - return _mm256_srai_epi32( - _mm256_unpacklo_epi16( 
- a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), - 16); -} - -SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { - return _mm256_unpackhi_epi16( - _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), - _mm256_setzero_si256()); -} - -SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { - return _mm256_srai_epi32( - _mm256_unpackhi_epi16( - a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), - 16); -} - -SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { - return _mm256_blendv_epi8( - _mm256_shuffle_epi8( - _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern), - _mm256_shuffle_epi8( - _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern), - _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); -} - -SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { - v256 c32 = v256_dup_8(32); - v256 p32 = v256_sub_8(pattern, c32); - v256 r1 = _mm256_blendv_epi8( - _mm256_shuffle_epi8( - _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32), - _mm256_shuffle_epi8( - _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32), - _mm256_cmpgt_epi8(v256_dup_8(48), pattern)); - v256 r2 = _mm256_blendv_epi8( - _mm256_shuffle_epi8( - _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern), - _mm256_shuffle_epi8( - _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern), - _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); - return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern)); -} - -SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { - return _mm256_shuffle_epi8(a, pattern); -} - -SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { - v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b)); - v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b)); - t1 = _mm256_add_epi32(t1, t2); - v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0), - _mm256_extracti128_si256(t1, 1)); - t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); - t = _mm_add_epi32(t, 
_mm_srli_si128(t, 4)); - return (int32_t)v128_low_u32(t); -} - -SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { - v256 r = _mm256_madd_epi16(a, b); -#if defined(__x86_64__) - v128 t; - r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), - _mm256_cvtepi32_epi64(v256_low_v128(r))); - t = v256_low_v128(_mm256_add_epi64( - r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); - return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); -#else - v128 l = v256_low_v128(r); - v128 h = v256_high_v128(r); - return (int64_t)_mm_cvtsi128_si32(l) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + - (int64_t)_mm_cvtsi128_si32(h) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); -#endif -} - -SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { - v256 r = _mm256_mullo_epi32(a, b); -#if defined(__x86_64__) - v128 t; - r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), - _mm256_cvtepi32_epi64(v256_low_v128(r))); - t = v256_low_v128(_mm256_add_epi64( - r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); - return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); -#else - v128 l = v256_low_v128(r); - v128 h = v256_high_v128(r); - return (int64_t)_mm_cvtsi128_si32(l) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + - (int64_t)_mm_cvtsi128_si32(h) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + - (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); -#endif -} - -SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { - v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); - v128 lo = v256_low_v128(t); - v128 hi = v256_high_v128(t); - lo = 
v128_add_32(lo, hi); - return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo)); -} - -typedef v256 sad256_internal; - -SIMD_INLINE sad256_internal v256_sad_u8_init() { - return _mm256_setzero_si256(); -} - -/* Implementation dependent return value. Result must be finalised with - v256_sad_u8_sum(). - The result for more than 32 v256_sad_u8() calls is undefined. */ -SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { - return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); -} - -SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { - v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); - return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); -} - -typedef v256 ssd256_internal; - -SIMD_INLINE ssd256_internal v256_ssd_u8_init() { - return _mm256_setzero_si256(); -} - -/* Implementation dependent return value. Result must be finalised with - * v256_ssd_u8_sum(). */ -SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { - v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), - _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); - v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), - _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); - v256 rl = _mm256_madd_epi16(l, l); - v256 rh = _mm256_madd_epi16(h, h); - v128 c = _mm_cvtsi32_si128(32); - rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8)); - rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4)); - rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8)); - rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4)); - return _mm256_add_epi64( - s, - _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c)); -} - -SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { - v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); - return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); -} - -SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); } - 
-SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); } - -SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); } - -SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); } - -SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) { - v128 lo_bits = v128_mullo_s16(a, b); - v128 hi_bits = v128_mulhi_s16(a, b); - return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), - v128_ziplo_16(hi_bits, lo_bits)); -} - -SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { - return _mm256_mullo_epi16(a, b); -} - -SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { - return _mm256_mulhi_epi16(a, b); -} - -SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { - return _mm256_mullo_epi32(a, b); -} - -SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { - return _mm256_madd_epi16(a, b); -} - -SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { - return _mm256_maddubs_epi16(a, b); -} - -SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); } - -SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { - return _mm256_sub_epi8( - _mm256_avg_epu8(a, b), - _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); -} - -SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { - return _mm256_sub_epi16( - _mm256_avg_epu16(a, b), - _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1))); -} - -SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } - -SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } - -SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } - -SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } - -SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); } - -SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { - return _mm256_blendv_epi8(a, b, c); -} - -SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); } - -SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return 
_mm256_min_epi16(a, b); } - -SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } - -SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); } - -SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); } - -SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { - return _mm256_cmpgt_epi8(a, b); -} - -SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { - return _mm256_cmpgt_epi8(b, a); -} - -SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { - return _mm256_cmpeq_epi8(a, b); -} - -SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { - return _mm256_cmpgt_epi16(a, b); -} - -SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { - return _mm256_cmpgt_epi16(b, a); -} - -SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { - return _mm256_cmpeq_epi16(a, b); -} - -SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { - return _mm256_cmpgt_epi32(a, b); -} - -SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { - return _mm256_cmpgt_epi32(b, a); -} - -SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { - return _mm256_cmpeq_epi32(a, b); -} - -SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { - return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)), - _mm256_sll_epi16(a, _mm_cvtsi32_si128(c))); -} - -SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { - return _mm256_and_si256(_mm256_set1_epi8(0xff >> c), - _mm256_srl_epi16(a, _mm_cvtsi32_si128(c))); -} - -SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { - __m128i x = _mm_cvtsi32_si128(c + 8); - return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x), - _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x)); -} - -SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { - return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { - return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { - return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c)); -} - 
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { - return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { - return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { - return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { - return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { - return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { -#if defined(__AVX512F__) - return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c)); -#else - return v256_from_v128(v128_shr_s64(v256_high_v128(a), c), - v128_shr_s64(v256_low_v128(a), c)); -#endif -} - -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -// _mm256_slli_si256 works on 128 bit lanes and can't be used -#define v256_shl_n_byte(a, n) \ - ((n) < 16 ? v256_from_v128( \ - v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \ - v128_shl_n_byte(v256_low_v128(a), n)) \ - : _mm256_inserti128_si256( \ - _mm256_setzero_si256(), \ - v128_shl_n_byte(v256_low_v128(a), (n)-16), 1)) - -// _mm256_srli_si256 works on 128 bit lanes and can't be used -#define v256_shr_n_byte(a, n) \ - ((n) < 16 \ - ? _mm256_alignr_epi8( \ - _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ - : _mm256_inserti128_si256( \ - _mm256_setzero_si256(), \ - v128_align(v256_high_v128(a), v256_high_v128(a), n), 0)) - -// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used -#define v256_align(a, b, c) \ - ((c) ? 
v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b) - -#define v256_shl_n_8(a, c) \ - _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \ - _mm256_slli_epi16(a, c)) -#define v256_shr_n_u8(a, c) \ - _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c)) -#define v256_shr_n_s8(a, c) \ - _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \ - _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8)) -#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c) -#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c) -#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c) -#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) -#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) -#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) -#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c) -#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c) -#define v256_shr_n_s64(a, c) \ - v256_shr_s64((a), (c)) // _mm256_srai_epi64 broken in gcc? -#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) -#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) - -typedef v256 sad256_internal_u16; - -SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); } - -/* Implementation dependent return value. Result must be finalised with - * v256_sad_u16_sum(). 
*/ -SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, - v256 b) { -#if defined(__SSE4_1__) - v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b)); -#else - v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)), - v256_xor(b, v256_dup_16(32768))); - t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)), - v256_or(v256_and(a, t), v256_andn(b, t))); -#endif - return v256_add_32( - s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t))); -} - -SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { - v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s)); - return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) + - v128_low_u32(v128_shr_n_byte(t, 8)) + - v128_low_u32(v128_shr_n_byte(t, 12)); -} - -typedef v256 ssd256_internal_s16; - -SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); } - -/* Implementation dependent return value. Result must be finalised with - * v256_ssd_s16_sum(). */ -SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, - v256 b) { - v256 d = v256_sub_16(a, b); - d = v256_madd_s16(d, d); - return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()), - _mm256_unpacklo_epi32(d, v256_zero()))); -} - -SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { - v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s)); - return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t)); -} - -#endif - -#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h deleted file mode 100644 index afc55428d..000000000 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ -#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ - -#include <stdio.h> -#include <stdlib.h> - -#include "aom_dsp/simd/v64_intrinsics_c.h" - -/* Fallback to plain, unoptimised C. */ - -typedef c_v64 v64; - -SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); } -SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); } -SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); } -SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); } -SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { - return c_v64_from_32(x, y); -} -SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); } -SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); } -SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { - return c_v64_from_16(a, b, c, d); -} - -SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { - return c_u32_load_unaligned(p); -} -SIMD_INLINE uint32_t u32_load_aligned(const void *p) { - return c_u32_load_aligned(p); -} -SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { - c_u32_store_unaligned(p, a); -} -SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { - c_u32_store_aligned(p, a); -} - -SIMD_INLINE v64 v64_load_unaligned(const void *p) { - return c_v64_load_unaligned(p); -} -SIMD_INLINE v64 v64_load_aligned(const void *p) { - return c_v64_load_aligned(p); -} - -SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { - c_v64_store_unaligned(p, a); -} -SIMD_INLINE void v64_store_aligned(void *p, v64 a) { - c_v64_store_aligned(p, a); -} - -SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { - return 
c_v64_align(a, b, c); -} - -SIMD_INLINE v64 v64_zero() { return c_v64_zero(); } -SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); } -SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); } -SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); } - -SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); } -SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); } -SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); } -SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); } -SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); } -SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); } -SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); } -SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); } -SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); } -SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); } -SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); } -SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); } -SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); } -SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); } -SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); } - -SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); } -SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); } -SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); } -SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); } -SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); } -SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); } -SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); } -SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); } 
-SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); } -SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); } -SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); } -SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); } -SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); } -SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } -SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { - return c_v64_pack_s32_s16(a, b); -} -SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { - return c_v64_pack_s32_u16(a, b); -} -SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { - return c_v64_pack_s16_u8(a, b); -} -SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { - return c_v64_pack_s16_s8(a, b); -} -SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { - return c_v64_unpacklo_u16_s32(a); -} -SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { - return c_v64_unpacklo_s16_s32(a); -} -SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { - return c_v64_unpackhi_u16_s32(a); -} -SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { - return c_v64_unpackhi_s16_s32(a); -} -SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) { - return c_v64_shuffle_8(a, pattern); -} - -typedef uint32_t sad64_internal; -SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); } -SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { - return c_v64_sad_u8(s, a, b); -} -SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { - return c_v64_sad_u8_sum(s); -} -typedef uint32_t ssd64_internal; -SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); } -SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { - return c_v64_ssd_u8(s, a, b); -} -SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { - return c_v64_ssd_u8_sum(s); -} -SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); } -SIMD_INLINE int64_t 
v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); } -SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); } -SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); } - -SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); } -SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); } -SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); } -SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); } - -SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); } -SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); } -SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); } -SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); } -SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } - -SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } -SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } -SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); } -SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } -SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } -SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } -SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); } -SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); } -SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); } -SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); } - -SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); } -SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); } -SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); } -SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); } -SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); 
} -SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); } - -SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); } -SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); } -SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); } -SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); } -SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) { - return c_v64_shr_u16(a, n); -} -SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) { - return c_v64_shr_s16(a, n); -} -SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); } -SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) { - return c_v64_shr_u32(a, n); -} -SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) { - return c_v64_shr_s32(a, n); -} -SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) { - return c_v64_shr_n_byte(a, n); -} -SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) { - return c_v64_shl_n_byte(a, n); -} -SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { - return c_v64_shl_n_8(a, c); -} -SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { - return c_v64_shr_n_u8(a, c); -} -SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { - return c_v64_shr_n_s8(a, c); -} -SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { - return c_v64_shl_n_16(a, c); -} -SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { - return c_v64_shr_n_u16(a, c); -} -SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { - return c_v64_shr_n_s16(a, c); -} -SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { - return c_v64_shl_n_32(a, c); -} -SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { - return c_v64_shr_n_u32(a, c); -} -SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { - return c_v64_shr_n_s32(a, c); -} - -#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h 
deleted file mode 100644 index 8f39ad6e8..000000000 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h +++ /dev/null @@ -1,680 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ - -#include <arm_neon.h> - -#include "aom_dsp/simd/v64_intrinsics_arm.h" -#include "aom_ports/arm.h" - -#ifdef AOM_INCOMPATIBLE_GCC -#error Incompatible gcc -#endif - -typedef int64x1_t v64; - -SIMD_INLINE uint32_t v64_low_u32(v64 a) { - return vget_lane_u32(vreinterpret_u32_s64(a), 0); -} - -SIMD_INLINE uint32_t v64_high_u32(v64 a) { - return vget_lane_u32(vreinterpret_u32_s64(a), 1); -} - -SIMD_INLINE int32_t v64_low_s32(v64 a) { - return vget_lane_s32(vreinterpret_s32_s64(a), 0); -} - -SIMD_INLINE int32_t v64_high_s32(v64 a) { - return vget_lane_s32(vreinterpret_s32_s64(a), 1); -} - -SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { - return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 | - d); -} - -SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { - return vcreate_s64((uint64_t)x << 32 | y); -} - -SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); } - -SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; } - -SIMD_INLINE uint32_t u32_load_aligned(const void *p) { - return *((uint32_t *)p); -} - -SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { - return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 
0); -} - -SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { - *((uint32_t *)p) = a; -} - -SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { -#if defined(__clang__) - vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a), - 0); -#elif defined(__CC_ARM) - *(__packed uint32_t *)p) = a; -#elif defined(__GNUC__) - *((__attribute((packed)) uint32_t *)p) = a; -#else - vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a), - 0); -#endif -} - -SIMD_INLINE v64 v64_load_aligned(const void *p) { - return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p)); -} - -SIMD_INLINE v64 v64_load_unaligned(const void *p) { - return v64_load_aligned(p); -} - -SIMD_INLINE void v64_store_aligned(void *p, v64 r) { - vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); -} - -SIMD_INLINE void v64_store_unaligned(void *p, v64 r) { - vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); -} - -// The following function requires an immediate. -// Some compilers will check this if it's optimising, others wont. -SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - return c ? vreinterpret_s64_s8( - vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c)) - : b; -#else - return c ? 
v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)) - : b; -#endif -} - -SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); } - -SIMD_INLINE v64 v64_dup_8(uint8_t x) { - return vreinterpret_s64_u8(vdup_n_u8(x)); -} - -SIMD_INLINE v64 v64_dup_16(uint16_t x) { - return vreinterpret_s64_u16(vdup_n_u16(x)); -} - -SIMD_INLINE v64 v64_dup_32(uint32_t x) { - return vreinterpret_s64_u32(vdup_n_u32(x)); -} - -SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { - int16x8_t t = - vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))); -#if defined(__aarch64__) - return vaddlvq_s16(t); -#else - int64x2_t r = vpaddlq_s32(vpaddlq_s16(t)); - return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r)); -#endif -} - -SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { -#if defined(__aarch64__) - return vaddlvq_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -#else - int64x2_t r = - vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); - return (int64_t)(vget_high_s64(r) + vget_low_s64(r)); -#endif -} - -SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { -#if defined(__aarch64__) - return vaddlv_u8(vreinterpret_u8_s64(x)); -#else - return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))); -#endif -} - -SIMD_INLINE int64_t v64_hadd_s16(v64 a) { - return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))); -} - -typedef uint16x8_t sad64_internal; - -SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); } - -// Implementation dependent return value. Result must be finalised with -// v64_sad_u8_sum(). 
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { - return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); -} - -SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { -#if defined(__aarch64__) - return vaddlvq_u16(s); -#else - uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); - return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r)); -#endif -} - -typedef uint32x4_t ssd64_internal; - -SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); } - -// Implementation dependent return value. Result must be finalised with -// v64_ssd_u8_sum(). -SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { - uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); - return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t))); -} - -SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { -#if defined(__aarch64__) - return vaddvq_u32(s); -#else - uint64x2_t t = vpaddlq_u32(s); - return vget_lane_u32( - vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); -#endif -} - -SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } - -SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); } - -SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); } - -SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); } - -SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - 
vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_add_32(v64 x, v64 y) { - return vreinterpret_s64_u32( - vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y))); -} - -SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) { - return vreinterpret_s64_s32( - vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); -} - -SIMD_INLINE v64 v64_abs_s16(v64 x) { - return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x))); -} - -SIMD_INLINE v64 v64_abs_s8(v64 x) { - return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x))); -} - -SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { -#if defined(__aarch64__) - int16x8_t t = vreinterpretq_s16_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); - return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t))); -#else - return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); -#endif -} - -SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { - 
return vreinterpret_s64_s32( - vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); -} - -SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { - int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)); - return vreinterpret_s64_s32( - vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))), - vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t))))); -} - -SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { - int16x8_t t = - vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))), - vmovl_s8(vreinterpret_s8_s64(y))); - return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t))); -} - -SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - 
vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u8( - vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u8( - vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u16( - vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); - return vreinterpret_s64_s16(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u16( - vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); - return vreinterpret_s64_s16(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u32( - vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); -#else - int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); - return vreinterpret_s64_s32(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u32( - vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); -#else - int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); - return vreinterpret_s64_s32(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { - return 
vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { - return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { - return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { - return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a)))); -} - -SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { - return vreinterpret_s64_s16(vqmovn_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); -} - -SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) { - return vreinterpret_s64_u16(vqmovun_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); -} - -SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { - return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); -} - -SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { - return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); -} - -SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u8( - vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u8( - vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u16( - vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), 
vreinterpret_u16_s64(x)); - return vreinterpret_s64_u16(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { -#if defined(__aarch64__) - return vreinterpret_s64_u16( - vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpret_s64_u16(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { - return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) { - return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) { - return vreinterpret_s64_s32( - vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) { - return vreinterpret_s64_u32( - vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x)))); -} - -SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { - return vreinterpret_s64_u8( - vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern))); -} - -SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { - 
return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c))); -} - -SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { - return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c))); -} - -SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { - return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c))); -} - -SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { - return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c))); -} - -SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { - return vreinterpret_s64_u16( - vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c))); -} - -SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { - return vreinterpret_s64_s16( - vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c))); -} - -SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { - return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c))); -} - -SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { - return vreinterpret_s64_u32( - vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c))); -} - -SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { - return vreinterpret_s64_s32( - vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c))); -} - -// The following functions require an immediate. -// Some compilers will check this during optimisation, others wont. -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - -SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { - return vshl_n_s64(a, c * 8); -} - -SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { - return c ? 
(v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a; -} - -SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { - return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)); -} - -SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { - return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)); -} - -SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { - return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)); -} - -SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { - return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)); -} - -SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { - return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)); -} - -SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { - return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)); -} - -SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { - return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)); -} - -SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { - return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)); -} - -SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { - return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)); -} - -#else - -SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { - return v64_from_64(v64_u64(a) << c * 8); -} - -SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { - return v64_from_64(v64_u64(a) >> c * 8); -} - -SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); } - -SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); } - -SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); } - -SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); } - -SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { - return v64_shr_u16(a, c); -} - -SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { - return v64_shr_s16(a, c); -} - -SIMD_INLINE v64 v64_shl_n_32(v64 a, 
unsigned int c) { return v64_shl_32(a, c); } - -SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { - return v64_shr_u32(a, c); -} - -SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { - return v64_shr_s32(a, c); -} - -#endif - -#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h deleted file mode 100644 index 028d68c4f..000000000 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h +++ /dev/null @@ -1,968 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ -#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ - -/* Note: This implements the intrinsics in plain, unoptimised C. - Intended for reference, porting or debugging. 
*/ - -#include <stdio.h> -#include <stdlib.h> - -#include "config/aom_config.h" - -typedef union { - uint8_t u8[8]; - uint16_t u16[4]; - uint32_t u32[2]; - uint64_t u64; - int8_t s8[8]; - int16_t s16[4]; - int32_t s32[2]; - int64_t s64; -} c_v64; - -SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { - return a.u32[!!CONFIG_BIG_ENDIAN]; -} - -SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { - return a.u32[!CONFIG_BIG_ENDIAN]; -} - -SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { - return a.s32[!!CONFIG_BIG_ENDIAN]; -} - -SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { - return a.s32[!CONFIG_BIG_ENDIAN]; -} - -SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { - c_v64 t; - t.u32[!CONFIG_BIG_ENDIAN] = x; - t.u32[!!CONFIG_BIG_ENDIAN] = y; - return t; -} - -SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) { - c_v64 t; - t.u64 = x; - return t; -} - -SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; } - -SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, - uint16_t d) { - c_v64 t; - if (CONFIG_BIG_ENDIAN) { - t.u16[0] = a; - t.u16[1] = b; - t.u16[2] = c; - t.u16[3] = d; - } else { - t.u16[3] = a; - t.u16[2] = b; - t.u16[1] = c; - t.u16[0] = d; - } - return t; -} - -SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) { - uint32_t t; - uint8_t *pp = (uint8_t *)p; - uint8_t *q = (uint8_t *)&t; - int c; - for (c = 0; c < 4; c++) q[c] = pp[c]; - return t; -} - -SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) { - uint8_t *pp = (uint8_t *)p; - uint8_t *q = (uint8_t *)&a; - int c; - for (c = 0; c < 4; c++) pp[c] = q[c]; -} - -SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) { - if (SIMD_CHECK && (uintptr_t)p & 3) { - fprintf(stderr, "Error: Unaligned u32 load at %p\n", p); - abort(); - } - return c_u32_load_unaligned(p); -} - -SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) { - if (SIMD_CHECK && (uintptr_t)p & 3) { - fprintf(stderr, "Error: Unaligned u32 store at %p\n", p); - abort(); - } - c_u32_store_unaligned(p, a); -} - 
-SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) { - c_v64 t; - uint8_t *pp = (uint8_t *)p; - uint8_t *q = (uint8_t *)&t; - int c; - for (c = 0; c < 8; c++) q[c] = pp[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) { - if (SIMD_CHECK && (uintptr_t)p & 7) { - fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p); - abort(); - } - return c_v64_load_unaligned(p); -} - -SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) { - uint8_t *q = (uint8_t *)p; - uint8_t *r = (uint8_t *)&a; - int c; - for (c = 0; c < 8; c++) q[c] = r[c]; -} - -SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) { - if (SIMD_CHECK && (uintptr_t)p & 7) { - fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p); - abort(); - } - c_v64_store_unaligned(p, a); -} - -SIMD_INLINE c_v64 c_v64_zero() { - c_v64 t; - t.u64 = 0; - return t; -} - -SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) { - c_v64 t; - t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] = - t.u8[7] = x; - return t; -} - -SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) { - c_v64 t; - t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x; - return t; -} - -SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) { - c_v64 t; - t.u32[0] = t.u32[1] = x; - return t; -} - -SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) - t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255 - ? 255 - : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0 - ? 0 - : (int16_t)a.u8[c] + (int16_t)b.u8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) - t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127 - ? 
127 - : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128 - ? -128 - : (int16_t)a.s8[c] + (int16_t)b.s8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) - t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767 - ? 32767 - : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768 - ? -32768 - : (int32_t)a.s16[c] + (int32_t)b.s16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) { - c_v64 t; - t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]); - t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]); - return t; -} - -SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) { - int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c]; - t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d); - } - return t; -} - -SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) - t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768 - ? -32768 - : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767 - ? 32767 - : (int32_t)a.s16[c] - (int32_t)b.s16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) - t.u16[c] = - (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 
0 : a.u16[c] - b.u16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) { - c_v64 t; - t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]); - t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]); - return t; -} - -SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) - t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]; - return t; -} - -SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) { - c_v64 t; - if (mode) { - t.u8[7] = a.u8[7]; - t.u8[6] = b.u8[7]; - t.u8[5] = a.u8[6]; - t.u8[4] = b.u8[6]; - t.u8[3] = a.u8[5]; - t.u8[2] = b.u8[5]; - t.u8[1] = a.u8[4]; - t.u8[0] = b.u8[4]; - } else { - t.u8[7] = a.u8[3]; - t.u8[6] = b.u8[3]; - t.u8[5] = a.u8[2]; - t.u8[4] = b.u8[2]; - t.u8[3] = a.u8[1]; - t.u8[2] = b.u8[1]; - t.u8[1] = a.u8[0]; - t.u8[0] = b.u8[0]; - } - return t; -} - -SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0); -} - -SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1); -} - -SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) { - c_v64 t; - if (mode) { - t.u16[3] = a.u16[3]; - t.u16[2] = b.u16[3]; - t.u16[1] = a.u16[2]; - t.u16[0] = b.u16[2]; - } else { - t.u16[3] = a.u16[1]; - t.u16[2] = b.u16[1]; - t.u16[1] = a.u16[0]; - t.u16[0] = b.u16[0]; - } - return t; -} - -SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0); -} - -SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? 
_c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1); -} - -SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) { - c_v64 t; - if (mode) { - t.u32[1] = a.u32[1]; - t.u32[0] = b.u32[1]; - } else { - t.u32[1] = a.u32[0]; - t.u32[0] = b.u32[0]; - } - return t; -} - -SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0); -} - -SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1); -} - -SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) { - c_v64 t; - if (mode) { - t.u8[7] = b.u8[7]; - t.u8[6] = b.u8[5]; - t.u8[5] = b.u8[3]; - t.u8[4] = b.u8[1]; - t.u8[3] = a.u8[7]; - t.u8[2] = a.u8[5]; - t.u8[1] = a.u8[3]; - t.u8[0] = a.u8[1]; - } else { - t.u8[7] = a.u8[6]; - t.u8[6] = a.u8[4]; - t.u8[5] = a.u8[2]; - t.u8[4] = a.u8[0]; - t.u8[3] = b.u8[6]; - t.u8[2] = b.u8[4]; - t.u8[1] = b.u8[2]; - t.u8[0] = b.u8[0]; - } - return t; -} - -SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0); -} - -SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1); -} - -SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) { - c_v64 t; - if (mode) { - t.u16[3] = b.u16[3]; - t.u16[2] = b.u16[1]; - t.u16[1] = a.u16[3]; - t.u16[0] = a.u16[1]; - } else { - t.u16[3] = a.u16[2]; - t.u16[2] = a.u16[0]; - t.u16[1] = b.u16[2]; - t.u16[0] = b.u16[0]; - } - return t; -} - -SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1) - : _c_v64_unzip_16(a, b, 0); -} - -SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) { - return CONFIG_BIG_ENDIAN ? 
_c_v64_unzip_16(b, a, 0) - : _c_v64_unzip_16(b, a, 1); -} - -SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) { - c_v64 t; - int endian = !!CONFIG_BIG_ENDIAN * 4; - t.s16[3] = (int16_t)a.u8[3 + endian]; - t.s16[2] = (int16_t)a.u8[2 + endian]; - t.s16[1] = (int16_t)a.u8[1 + endian]; - t.s16[0] = (int16_t)a.u8[0 + endian]; - return t; -} - -SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) { - c_v64 t; - int endian = !!CONFIG_BIG_ENDIAN * 4; - t.s16[3] = (int16_t)a.u8[7 - endian]; - t.s16[2] = (int16_t)a.u8[6 - endian]; - t.s16[1] = (int16_t)a.u8[5 - endian]; - t.s16[0] = (int16_t)a.u8[4 - endian]; - return t; -} - -SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) { - c_v64 t; - int endian = !!CONFIG_BIG_ENDIAN * 4; - t.s16[3] = (int16_t)a.s8[3 + endian]; - t.s16[2] = (int16_t)a.s8[2 + endian]; - t.s16[1] = (int16_t)a.s8[1 + endian]; - t.s16[0] = (int16_t)a.s8[0 + endian]; - return t; -} - -SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) { - c_v64 t; - int endian = !!CONFIG_BIG_ENDIAN * 4; - t.s16[3] = (int16_t)a.s8[7 - endian]; - t.s16[2] = (int16_t)a.s8[6 - endian]; - t.s16[1] = (int16_t)a.s8[5 - endian]; - t.s16[0] = (int16_t)a.s8[4 - endian]; - return t; -} - -SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { - c_v64 t; - if (CONFIG_BIG_ENDIAN) { - c_v64 u = a; - a = b; - b = u; - } - t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1]; - t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0]; - t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1]; - t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0]; - return t; -} - -SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) { - c_v64 t; - if (CONFIG_BIG_ENDIAN) { - c_v64 u = a; - a = b; - b = u; - } - t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1]; - t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0]; - t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 
0 : b.s32[1]; - t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0]; - return t; -} - -SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { - c_v64 t; - if (CONFIG_BIG_ENDIAN) { - c_v64 u = a; - a = b; - b = u; - } - t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3]; - t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2]; - t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1]; - t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0]; - t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3]; - t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2]; - t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1]; - t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0]; - return t; -} - -SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) { - c_v64 t; - if (CONFIG_BIG_ENDIAN) { - c_v64 u = a; - a = b; - b = u; - } - t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3]; - t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2]; - t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1]; - t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0]; - t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3]; - t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2]; - t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1]; - t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 
128 : b.s16[0]; - return t; -} - -SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) { - c_v64 t; - t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2]; - t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2]; - return t; -} - -SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) { - c_v64 t; - t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2]; - t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2]; - return t; -} - -SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) { - c_v64 t; - t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2]; - t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2]; - return t; -} - -SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) { - c_v64 t; - t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2]; - t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2]; - return t; -} - -SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) { - if (SIMD_CHECK && (pattern.u8[c] & ~7)) { - fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", - pattern.u8[c], c); - abort(); - } - t.u8[c] = - a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7]; - } - return t; -} - -SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) { - return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] + - a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] + - a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0]; -} - -SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) { - return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) + - (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]); -} - -SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) { - return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] + - a.u8[0]; -} - -SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) { - return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0]; -} - -typedef uint32_t c_sad64_internal; - -/* Implementation dependent return value. Result must be finalised with - v64_sad_u8_sum(). - The result for more than 32 v64_sad_u8() calls is undefined. 
*/ -SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; } - -SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a, - c_v64 b) { - int c; - for (c = 0; c < 8; c++) - s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; - return s; -} - -SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; } - -typedef uint32_t c_ssd64_internal; - -/* Implementation dependent return value. Result must be finalised with - * v64_ssd_u8_sum(). */ -SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; } - -SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a, - c_v64 b) { - int c; - for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); - return s; -} - -SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; } - -SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) { - c_v64 t; - t.u64 = a.u64 | b.u64; - return t; -} - -SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) { - c_v64 t; - t.u64 = a.u64 ^ b.u64; - return t; -} - -SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) { - c_v64 t; - t.u64 = a.u64 & b.u64; - return t; -} - -SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) { - c_v64 t; - t.u64 = a.u64 & ~b.u64; - return t; -} - -SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]); - return t; -} - -SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16; - return t; -} - -SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) { - c_v64 t; - t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]); - t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]); - return t; -} - -SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) { - c_v64 t; - t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1]; - t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3]; - return t; -} - -SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) { - c_v64 t; - int32_t u; 
- u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1]; - t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; - u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3]; - t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; - u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5]; - t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; - u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7]; - t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; - return t; -} - -SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1; - return t; -} - -SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1; - return t; -} - -SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1; - return t; -} - -SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1; - return t; -} - -SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? 
b.s16[c] : a.s16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c]; - return t; -} - -SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]); - return t; -} - -SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]); - return t; -} - -SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]); - return t; -} - -SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]); - return t; -} - -SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]); - return t; -} - -SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) { - c_v64 t; - int c; - for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]); - return t; -} - -SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) { - c_v64 t; - int c; - if (SIMD_CHECK && n > 7) { - fprintf(stderr, "Error: Undefined u8 shift left %d\n", n); - abort(); - } - for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) { - c_v64 t; - int c; - if (SIMD_CHECK && n > 7) { - fprintf(stderr, "Error: Undefined u8 shift right %d\n", n); - abort(); - } - for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) { - c_v64 t; - int c; - if (SIMD_CHECK && n > 7) { - fprintf(stderr, "Error: Undefined s8 shift right %d\n", n); - abort(); - } - for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) { - c_v64 t; - int c; - if (SIMD_CHECK && n > 15) { - fprintf(stderr, 
"Error: Undefined u16 shift left %d\n", n); - abort(); - } - for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) { - c_v64 t; - int c; - if (SIMD_CHECK && n > 15) { - fprintf(stderr, "Error: Undefined u16 shift right %d\n", n); - abort(); - } - for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) { - c_v64 t; - int c; - if (SIMD_CHECK && n > 15) { - fprintf(stderr, "Error: undefined s16 shift right %d\n", n); - abort(); - } - for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) { - c_v64 t; - if (SIMD_CHECK && n > 31) { - fprintf(stderr, "Error: undefined u32 shift left %d\n", n); - abort(); - } - t.u32[1] = a.u32[1] << n; - t.u32[0] = a.u32[0] << n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) { - c_v64 t; - if (SIMD_CHECK && n > 31) { - fprintf(stderr, "Error: undefined u32 shift right %d\n", n); - abort(); - } - t.u32[1] = a.u32[1] >> n; - t.u32[0] = a.u32[0] >> n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) { - c_v64 t; - if (SIMD_CHECK && n > 31) { - fprintf(stderr, "Error: undefined s32 shift right %d\n", n); - abort(); - } - t.s32[1] = a.s32[1] >> n; - t.s32[0] = a.s32[0] >> n; - return t; -} - -SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) { - c_v64 t; - t.u64 = x.u64 >> i * 8; - return t; -} - -SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) { - c_v64 t; - t.u64 = x.u64 << i * 8; - return t; -} - -SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) { - if (SIMD_CHECK && c > 7) { - fprintf(stderr, "Error: undefined alignment %d\n", c); - abort(); - } - return c ? 
c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b; -} - -SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) { - return c_v64_shl_8(a, c); -} - -SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) { - return c_v64_shr_u8(a, c); -} - -SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) { - return c_v64_shr_s8(a, c); -} - -SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) { - return c_v64_shl_16(a, c); -} - -SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) { - return c_v64_shr_u16(a, c); -} - -SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) { - return c_v64_shr_s16(a, c); -} - -SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) { - return c_v64_shl_32(a, c); -} - -SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) { - return c_v64_shr_u32(a, c); -} - -SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { - return c_v64_shr_s32(a, c); -} - -#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h deleted file mode 100644 index 5f9a57b37..000000000 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h +++ /dev/null @@ -1,491 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ -#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ - -#include <emmintrin.h> -#if defined(__SSSE3__) -#include <tmmintrin.h> -#endif -#if defined(__SSE4_1__) -#include <smmintrin.h> -#endif - -typedef __m128i v64; - -SIMD_INLINE uint32_t v64_low_u32(v64 a) { - return (uint32_t)_mm_cvtsi128_si32(a); -} - -SIMD_INLINE uint32_t v64_high_u32(v64 a) { - return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); -} - -SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); } - -SIMD_INLINE int32_t v64_high_s32(v64 a) { - return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); -} - -SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { - return _mm_packs_epi32( - _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d), - _mm_setzero_si128()); -} - -SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { - return _mm_set_epi32(0, 0, x, y); -} - -SIMD_INLINE v64 v64_from_64(uint64_t x) { -#ifdef __x86_64__ - return _mm_cvtsi64_si128(x); -#else - return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x); -#endif -} - -SIMD_INLINE uint64_t v64_u64(v64 x) { - return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32); -} - -SIMD_INLINE uint32_t u32_load_aligned(const void *p) { - return *((uint32_t *)p); -} - -SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { - return *((uint32_t *)p); -} - -SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { - *((uint32_t *)p) = a; -} - -SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { - *((uint32_t *)p) = a; -} - -SIMD_INLINE v64 v64_load_aligned(const void *p) { - return _mm_loadl_epi64((__m128i *)p); -} - -SIMD_INLINE v64 v64_load_unaligned(const void *p) { - return _mm_loadl_epi64((__m128i *)p); -} - -SIMD_INLINE void v64_store_aligned(void *p, v64 a) { - _mm_storel_epi64((__m128i *)p, a); -} - -SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { - _mm_storel_epi64((__m128i *)p, a); -} - -#if 
defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) -#define v64_align(a, b, c) \ - ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) -#else -#define v64_align(a, b, c) \ - ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \ - : (b)) -#endif - -SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); } - -SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); } - -SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); } - -SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); } - -SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } - -SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } - -SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); } - -SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); } - -SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } - -SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } - -SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); } - -SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); } - -SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); } - -SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); } - -SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); } - -SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); } - -SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); } - -SIMD_INLINE v64 v64_abs_s16(v64 a) { -#if defined(__SSSE3__) - return _mm_abs_epi16(a); -#else - return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); -#endif -} - -SIMD_INLINE v64 v64_abs_s8(v64 a) { -#if defined(__SSSE3__) - return _mm_abs_epi8(a); -#else - v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); - return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); -#endif -} - -SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return 
_mm_unpacklo_epi8(b, a); } - -SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { - return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8); -} - -SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } - -SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { - return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8); -} - -SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } - -SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { - return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8); -} - -SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { - __m128i t = _mm_unpacklo_epi64(b, a); - return _mm_packs_epi32(t, t); -} - -SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { -#if defined(__SSE4_1__) - __m128i t = _mm_unpacklo_epi64(b, a); - return _mm_packus_epi32(t, t); -#else - int32_t ah = v64_high_u32(a); - int32_t al = v64_low_u32(a); - int32_t bh = v64_high_u32(b); - int32_t bl = v64_low_u32(b); - return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah, - al > 65535 ? 65535 : al < 0 ? 0 : al, - bh > 65535 ? 65535 : bh < 0 ? 0 : bh, - bl > 65535 ? 65535 : bl < 0 ? 
0 : bl); -#endif -} - -SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { - __m128i t = _mm_unpacklo_epi64(b, a); - return _mm_packus_epi16(t, t); -} - -SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { - __m128i t = _mm_unpacklo_epi64(b, a); - return _mm_packs_epi16(t, t); -} - -SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { -#if defined(__SSSE3__) - return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), - v64_from_64(0x0f0d0b0907050301LL)); -#else - return _mm_packus_epi16( - _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)), - _mm_setzero_si128()); -#endif -} - -SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { -#if defined(__SSSE3__) - return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), - v64_from_64(0x0e0c0a0806040200LL)); -#else - return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); -#endif -} - -SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { -#if defined(__SSSE3__) - return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), - v64_from_64(0x0f0e0b0a07060302LL)); -#else - return _mm_packs_epi32( - _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), - _mm_setzero_si128()); -#endif -} - -SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { -#if defined(__SSSE3__) - return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), - v64_from_64(0x0d0c090805040100LL)); -#else - return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); -#endif -} - -SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { - return _mm_unpacklo_epi8(a, _mm_setzero_si128()); -} - -SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { - return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8); -} - -SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { - return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); -} - -SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { - return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8); -} - -SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { - return _mm_unpacklo_epi16(a, _mm_setzero_si128()); -} - -SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { - return 
_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16); -} - -SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { - return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8); -} - -SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { - return _mm_srli_si128( - _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8); -} - -SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { -#if defined(__SSSE3__) - return _mm_shuffle_epi8(x, pattern); -#else - v64 output; - unsigned char *input = (unsigned char *)&x; - unsigned char *index = (unsigned char *)&pattern; - char *selected = (char *)&output; - int counter; - - for (counter = 0; counter < 8; counter++) { - selected[counter] = input[index[counter]]; - } - - return output; -#endif -} - -SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { - __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), - _mm_unpacklo_epi8(b, _mm_setzero_si128())); - t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); - t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); - return (int32_t)v64_low_u32(t); -} - -SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { - __m128i r = _mm_madd_epi16(a, b); -#if defined(__SSE4_1__) && defined(__x86_64__) - __m128i x = _mm_cvtepi32_epi64(r); - return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8))); -#else - return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + - (int64_t)_mm_cvtsi128_si32(r); -#endif -} - -SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { - return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128())); -} - -SIMD_INLINE int64_t v64_hadd_s16(v64 a) { - return v64_dotp_s16(a, v64_dup_16(1)); -} - -typedef v64 sad64_internal; - -SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); } - -/* Implementation dependent return value. Result must be finalised with - v64_sad_u8_sum(). - The result for more than 32 v64_sad_u8() calls is undefined. 
*/ -SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { - return _mm_add_epi64(s, _mm_sad_epu8(a, b)); -} - -SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); } - -typedef v64 ssd64_internal; - -SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); } - -/* Implementation dependent return value. Result must be finalised with - * v64_ssd_u8_sum(). */ -SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { - v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b)); - v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b)); - v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h)); - return _mm_add_epi64( - s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4)))); -} - -SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); } - -SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); } - -SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); } - -SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); } - -SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); } - -SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); } - -SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); } - -SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { -#if defined(__SSE4_1__) - return _mm_mullo_epi32(a, b); -#else - return _mm_unpacklo_epi32( - _mm_mul_epu32(a, b), - _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4))); -#endif -} - -SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); } - -SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { -#if defined(__SSSE3__) - return _mm_maddubs_epi16(a, b); -#else - __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), - _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)); - return _mm_packs_epi32(t, t); -#endif -} - -SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return 
_mm_avg_epu8(a, b); } - -SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { - return _mm_sub_epi8(_mm_avg_epu8(a, b), - _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); -} - -SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { - return _mm_sub_epi16(_mm_avg_epu16(a, b), - _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1))); -} - -SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } - -SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } - -SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); } - -SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { -#if defined(__SSE4_1__) - return _mm_min_epi8(a, b); -#else - v64 mask = _mm_cmplt_epi8(a, b); - return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); -#endif -} - -SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { -#if defined(__SSE4_1__) - return _mm_max_epi8(a, b); -#else - v64 mask = _mm_cmplt_epi8(b, a); - return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); -#endif -} - -SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); } - -SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); } - -SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); } - -SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); } - -SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); } - -SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); } - -SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); } - -SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); } - -SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { - return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), - _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); -} - -SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { - return _mm_and_si128(_mm_set1_epi8(0xff >> c), - _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); -} - -SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { - 
return _mm_packs_epi16( - _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a); -} - -SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { - return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { - return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { - return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { - return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { - return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); -} - -SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { - return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); -} - -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c) -#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8) -#define v64_shl_n_8(a, c) \ - _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) -#define v64_shr_n_u8(a, c) \ - _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) -#define v64_shr_n_s8(a, c) \ - _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a) -#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c) -#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c) -#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c) -#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c) -#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) -#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) - -#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c deleted file mode 100644 index 249394807..000000000 --- a/third_party/aom/aom_dsp/sse.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -/* Sum the difference between every corresponding element of the buffers. */ - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int y, x; - int64_t sse = 0; - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - const int32_t diff = abs(a[x] - b[x]); - sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } - return sse; -} - -int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, int width, int height) { - int y, x; - int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); - sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } - return sse; -} diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c deleted file mode 100644 index 681770ba9..000000000 --- a/third_party/aom/aom_dsp/ssim.c +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <math.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/ssim.h" -#include "aom_ports/mem.h" -#include "aom_ports/system_state.h" - -void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, - uint32_t *sum_sq_s, uint32_t *sum_sq_r, - uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 16; i++, s += sp, r += rp) { - for (j = 0; j < 16; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } - } -} - -void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 8; i++, s += sp, r += rp) { - for (j = 0; j < 8; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } - } -} - -void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t *sum_s, uint32_t *sum_r, - uint32_t *sum_sq_s, uint32_t *sum_sq_r, - uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 8; i++, s += sp, r += rp) { - for (j = 0; j < 8; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } - } -} - -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 -static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 -static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 -static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 - -static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, - 
uint32_t sum_sq_r, uint32_t sum_sxr, int count, - uint32_t bd) { - int64_t ssim_n, ssim_d; - int64_t c1, c2; - if (bd == 8) { - // scale the constants by number of pixels - c1 = (cc1 * count * count) >> 12; - c2 = (cc2 * count * count) >> 12; - } else if (bd == 10) { - c1 = (cc1_10 * count * count) >> 12; - c2 = (cc2_10 * count * count) >> 12; - } else if (bd == 12) { - c1 = (cc1_12 * count * count) >> 12; - c2 = (cc2_12 * count * count) >> 12; - } else { - c1 = c2 = 0; - assert(0); - } - - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); - - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); - - return ssim_n * 1.0 / ssim_d; -} - -static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { - uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, - &sum_sxr); - return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); -} - -static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t bd, uint32_t shift) { - uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, - &sum_sxr); - return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), - sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); -} - -// We are using a 8x8 moving window with starting location of each 8x8 window -// on the 4x4 pixel grid. Such arrangement allows the windows to overlap -// block boundaries to penalize blocking artifacts. 
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, - int stride_img1, int stride_img2, int width, - int height) { - int i, j; - int samples = 0; - double ssim_total = 0; - - // sample point start with each 4x4 location - for (i = 0; i <= height - 8; - i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j <= width - 8; j += 4) { - double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); - ssim_total += v; - samples++; - } - } - ssim_total /= samples; - return ssim_total; -} - -static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, - int stride_img1, int stride_img2, int width, - int height, uint32_t bd, uint32_t shift) { - int i, j; - int samples = 0; - double ssim_total = 0; - - // sample point start with each 4x4 location - for (i = 0; i <= height - 8; - i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j <= width - 8; j += 4) { - double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, - CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, - shift); - ssim_total += v; - samples++; - } - } - ssim_total /= samples; - return ssim_total; -} - -double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight) { - double abc[3]; - for (int i = 0; i < 3; ++i) { - const int is_uv = i > 0; - abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i], - source->strides[is_uv], dest->strides[is_uv], - source->crop_widths[is_uv], source->crop_heights[is_uv]); - } - - *weight = 1; - return abc[0] * .8 + .1 * (abc[1] + abc[2]); -} - -// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity -// -// Re working out the math -> -// -// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / -// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) -// -// mean(x) = sum(x) / n -// -// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) -// -// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) -// -// ssim(x,y) = -// (2*sum(x)*sum(y)/(n*n) + 
c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / -// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * -// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ -// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) -// -// factoring out n*n -// -// ssim(x,y) = -// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / -// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * -// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) -// -// Replace c1 with n*n * c1 for the final step that leads to this code: -// The final step scales by 12 bits so we don't lose precision in the constants. - -static double ssimv_similarity(const Ssimv *sv, int64_t n) { - // Scale the constants by number of pixels. - const int64_t c1 = (cc1 * n * n) >> 12; - const int64_t c2 = (cc2 * n * n) >> 12; - - const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / - (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); - - // Since these variables are unsigned sums, convert to double so - // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / - (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); - - return l * v; -} - -// The first term of the ssim metric is a luminance factor. -// -// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) -// -// This luminance factor is super sensitive to the dark side of luminance -// values and completely insensitive on the white side. check out 2 sets -// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 -// 2*250*252/ (250^2+252^2) => .99999997 -// -// As a result in this tweaked version of the calculation in which the -// luminance is taken as percentage off from peak possible. -// -// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count -// -static double ssimv_similarity2(const Ssimv *sv, int64_t n) { - // Scale the constants by number of pixels. 
- const int64_t c1 = (cc1 * n * n) >> 12; - const int64_t c2 = (cc2 * n * n) >> 12; - - const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; - const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); - - // Since these variables are unsigned, sums convert to double so - // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / - (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); - - return l * v; -} -static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, Ssimv *sv) { - aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, - &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); -} - -double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, int width, int height, Ssimv *sv2, - Metrics *m, int do_inconsistency) { - double dssim_total = 0; - double ssim_total = 0; - double ssim2_total = 0; - double inconsistency_total = 0; - int i, j; - int c = 0; - double norm; - double old_ssim_total = 0; - aom_clear_system_state(); - // We can sample points as frequently as we like start with 1 per 4x4. - for (i = 0; i < height; - i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { - for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0, 0, 0, 0, 0, 0 }; - double ssim; - double ssim2; - double dssim; - uint32_t var_new; - uint32_t var_old; - uint32_t mean_new; - uint32_t mean_old; - double ssim_new; - double ssim_old; - - // Not sure there's a great way to handle the edge pixels - // in ssim when using a window. Seems biased against edge pixels - // however you handle this. This uses only samples that are - // fully in the frame. 
- if (j + 8 <= width && i + 8 <= height) { - ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); - } - - ssim = ssimv_similarity(&sv, 64); - ssim2 = ssimv_similarity2(&sv, 64); - - sv.ssim = ssim2; - - // dssim is calculated to use as an actual error metric and - // is scaled up to the same range as sum square error. - // Since we are subsampling every 16th point maybe this should be - // *16 ? - dssim = 255 * 255 * (1 - ssim2) / 2; - - // Here I introduce a new error metric: consistency-weighted - // SSIM-inconsistency. This metric isolates frames where the - // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much - // sharper or blurrier than the others. Higher values indicate a - // temporally inconsistent SSIM. There are two ideas at work: - // - // 1) 'SSIM-inconsistency': the total inconsistency value - // reflects how much SSIM values are changing between this - // source / reference frame pair and the previous pair. - // - // 2) 'consistency-weighted': weights de-emphasize areas in the - // frame where the scene content has changed. Changes in scene - // content are detected via changes in local variance and local - // mean. - // - // Thus the overall measure reflects how inconsistent the SSIM - // values are, over consistent regions of the frame. - // - // The metric has three terms: - // - // term 1 -> uses change in scene Variance to weight error score - // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) - // larger changes from one frame to the next mean we care - // less about consistency. - // - // term 2 -> uses change in local scene luminance to weight error - // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) - // larger changes from one frame to the next mean we care - // less about consistency. - // - // term3 -> measures inconsistency in ssim scores between frames - // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). - // - // This term compares the ssim score for the same location in 2 - // subsequent frames. 
- var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; - var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; - mean_new = sv.sum_s; - mean_old = sv2[c].sum_s; - ssim_new = sv.ssim; - ssim_old = sv2[c].ssim; - - if (do_inconsistency) { - // We do the metric once for every 4x4 block in the image. Since - // we are scaling the error to SSE for use in a psnr calculation - // 1.0 = 4x4x255x255 the worst error we can possibly have. - static const double kScaling = 4. * 4 * 255 * 255; - - // The constants have to be non 0 to avoid potential divide by 0 - // issues other than that they affect kind of a weighting between - // the terms. No testing of what the right terms should be has been - // done. - static const double c1 = 1, c2 = 1, c3 = 1; - - // This measures how much consistent variance is in two consecutive - // source frames. 1.0 means they have exactly the same variance. - const double variance_term = - (2.0 * var_old * var_new + c1) / - (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); - - // This measures how consistent the local mean are between two - // consecutive frames. 1.0 means they have exactly the same mean. - const double mean_term = - (2.0 * mean_old * mean_new + c2) / - (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); - - // This measures how consistent the ssims of two - // consecutive frames is. 1.0 means they are exactly the same. - double ssim_term = - pow((2.0 * ssim_old * ssim_new + c3) / - (ssim_old * ssim_old + ssim_new * ssim_new + c3), - 5); - - double this_inconsistency; - - // Floating point math sometimes makes this > 1 by a tiny bit. - // We want the metric to scale between 0 and 1.0 so we can convert - // it to an snr scaled value. - if (ssim_term > 1) ssim_term = 1; - - // This converts the consistency metric to an inconsistency metric - // ( so we can scale it like psnr to something like sum square error. 
- // The reason for the variance and mean terms is the assumption that - // if there are big changes in the source we shouldn't penalize - // inconsistency in ssim scores a bit less as it will be less visible - // to the user. - this_inconsistency = (1 - ssim_term) * variance_term * mean_term; - - this_inconsistency *= kScaling; - inconsistency_total += this_inconsistency; - } - sv2[c] = sv; - ssim_total += ssim; - ssim2_total += ssim2; - dssim_total += dssim; - - old_ssim_total += ssim_old; - } - old_ssim_total += 0; - } - - norm = 1. / (width / 4) / (height / 4); - ssim_total *= norm; - ssim2_total *= norm; - m->ssim2 = ssim2_total; - m->ssim = ssim_total; - if (old_ssim_total == 0) inconsistency_total = 0; - - m->ssimc = inconsistency_total; - - m->dssim = dssim_total; - return inconsistency_total; -} - -double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - uint32_t bd, uint32_t in_bd) { - assert(bd >= in_bd); - const uint32_t shift = bd - in_bd; - - double abc[3]; - for (int i = 0; i < 3; ++i) { - const int is_uv = i > 0; - abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i], - source->strides[is_uv], dest->strides[is_uv], - source->crop_widths[is_uv], - source->crop_heights[is_uv], in_bd, shift); - } - - *weight = 1; - return abc[0] * .8 + .1 * (abc[1] + abc[2]); -} diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h deleted file mode 100644 index 55038f4c2..000000000 --- a/third_party/aom/aom_dsp/ssim.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SSIM_H_ -#define AOM_AOM_DSP_SSIM_H_ - -#define MAX_SSIM_DB 100.0; - -#ifdef __cplusplus -extern "C" { -#endif - -#include "config/aom_config.h" - -#include "aom_scale/yv12config.h" - -// metrics used for calculating ssim, ssim2, dssim, and ssimc -typedef struct { - // source sum ( over 8x8 region ) - uint32_t sum_s; - - // reference sum (over 8x8 region ) - uint32_t sum_r; - - // source sum squared ( over 8x8 region ) - uint32_t sum_sq_s; - - // reference sum squared (over 8x8 region ) - uint32_t sum_sq_r; - - // sum of source times reference (over 8x8 region) - uint32_t sum_sxr; - - // calculated ssim score between source and reference - double ssim; -} Ssimv; - -// metrics collected on a frame basis -typedef struct { - // ssim consistency error metric ( see code for explanation ) - double ssimc; - - // standard ssim - double ssim; - - // revised ssim ( see code for explanation) - double ssim2; - - // ssim restated as an error metric like sse - double dssim; - - // dssim converted to decibels - double dssimd; - - // ssimc converted to decibels - double ssimcd; -} Metrics; - -double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, int width, int height, Ssimv *sv2, - Metrics *m, int do_inconsistency); - -double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight); - -double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *ssim_y, - double *ssim_u, double *ssim_v, uint32_t bd, - uint32_t in_bd); - -double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - uint32_t bd, uint32_t in_bd); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_SSIM_H_ diff --git 
a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c deleted file mode 100644 index 2f6da96e5..000000000 --- a/third_party/aom/aom_dsp/subtract.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <stdlib.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -void aom_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src, - ptrdiff_t src_stride, const uint8_t *pred, - ptrdiff_t pred_stride) { - int r, c; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} - -void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src8, - ptrdiff_t src_stride, const uint8_t *pred8, - ptrdiff_t pred_stride, int bd) { - int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - (void)bd; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; - } - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c deleted file mode 100644 index 44ec41f2e..000000000 --- a/third_party/aom/aom_dsp/sum_squares.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open 
Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width, - int height) { - int r, c; - uint64_t ss = 0; - - for (r = 0; r < height; r++) { - for (c = 0; c < width; c++) { - const int16_t v = src[c]; - ss += v * v; - } - src += src_stride; - } - - return ss; -} - -uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) { - uint64_t ss = 0; - do { - const int16_t v = *src++; - ss += v * v; - } while (--n); - - return ss; -} diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h deleted file mode 100644 index f98242840..000000000 --- a/third_party/aom/aom_dsp/txfm_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_TXFM_COMMON_H_ -#define AOM_AOM_DSP_TXFM_COMMON_H_ - -#include "aom_dsp/aom_dsp_common.h" -#include "av1/common/enums.h" - -// Constants and Macros used by all idct/dct functions -#define DCT_CONST_BITS 14 -#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) - -#define UNIT_QUANT_SHIFT 2 -#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) - -typedef struct txfm_param { - // for both forward and inverse transforms - TX_TYPE tx_type; - TX_SIZE tx_size; - int lossless; - int bd; - // are the pixel buffers octets or shorts? This should collapse to - // bd==8 implies !is_hbd, but that's not certain right now. - int is_hbd; - TxSetType tx_set_type; - // for inverse transforms only - int eob; -} TxfmParam; - -// Constants: -// for (int i = 1; i< 32; ++i) -// printf("static const int cospi_%d_64 = %.0f;\n", i, -// round(16384 * cos(i*M_PI/64))); -// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; -static const tran_high_t cospi_10_64 = 14449; -static const tran_high_t cospi_11_64 = 14053; -static const tran_high_t cospi_12_64 = 13623; -static const tran_high_t cospi_13_64 = 13160; -static const tran_high_t cospi_14_64 = 12665; -static const tran_high_t cospi_15_64 = 12140; -static const tran_high_t cospi_16_64 = 11585; -static const tran_high_t cospi_17_64 = 11003; -static const tran_high_t cospi_18_64 = 10394; -static const tran_high_t cospi_19_64 = 9760; -static const tran_high_t cospi_20_64 = 9102; -static const tran_high_t cospi_21_64 = 8423; -static const tran_high_t cospi_22_64 = 7723; -static const tran_high_t cospi_23_64 = 7005; -static 
const tran_high_t cospi_24_64 = 6270; -static const tran_high_t cospi_25_64 = 5520; -static const tran_high_t cospi_26_64 = 4756; -static const tran_high_t cospi_27_64 = 3981; -static const tran_high_t cospi_28_64 = 3196; -static const tran_high_t cospi_29_64 = 2404; -static const tran_high_t cospi_30_64 = 1606; -static const tran_high_t cospi_31_64 = 804; - -// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const tran_high_t sinpi_1_9 = 5283; -static const tran_high_t sinpi_2_9 = 9929; -static const tran_high_t sinpi_3_9 = 13377; -static const tran_high_t sinpi_4_9 = 15212; - -// 16384 * sqrt(2) -static const tran_high_t Sqrt2 = 23170; -static const tran_high_t InvSqrt2 = 11585; - -static INLINE tran_high_t fdct_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return rv; -} - -#endif // AOM_AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c deleted file mode 100644 index 23b715309..000000000 --- a/third_party/aom/aom_dsp/variance.c +++ /dev/null @@ -1,1579 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ -#include <assert.h> -#include <stdlib.h> -#include <string.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" -#include "config/av1_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/blend.h" -#include "aom_dsp/variance.h" - -#include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" -#include "av1/common/reconinter.h" - -uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride) { - int distortion = 0; - int r, c; - - for (r = 0; r < 4; ++r) { - for (c = 0; c < 4; ++c) { - int diff = a[c] - b[c]; - distortion += diff * diff; - } - - a += a_stride; - b += b_stride; - } - - return distortion; -} - -uint32_t aom_get_mb_ss_c(const int16_t *a) { - unsigned int i, sum = 0; - - for (i = 0; i < 256; ++i) { - sum += a[i] * a[i]; - } - - return sum; -} - -static void variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, int *sum) { - int i, j; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } -} - -uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h) { - uint32_t sse; - int sum; - variance(a, a_stride, b, b_stride, w, h, &sse, &sum); - return sse; -} - -// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal -// or vertical direction to produce the filtered output block. Used to implement -// the first-pass of 2-D separable filter. -// -// Produces int16_t output to retain precision for the next pass. Two filter -// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is -// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). -// It defines the offset required to move from one input to the next. 
-void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - unsigned int i, j; - - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - - ++a; - } - - a += src_pixels_per_line - output_width; - b += output_width; - } -} - -// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal -// or vertical direction to produce the filtered output block. Used to implement -// the second-pass of 2-D separable filter. -// -// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two -// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the -// filter is applied horizontally (pixel_step = 1) or vertically -// (pixel_step = stride). It defines the offset required to move from one input -// to the next. Output is 8-bit. 
-void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - unsigned int i, j; - - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; - } - - a += src_pixels_per_line - output_width; - b += output_width; - } -} - -#define VAR(W, H) \ - uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } - -#define SUBPIX_VAR(W, H) \ - uint32_t aom_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ - } - -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ - \ - return 
aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ - } \ - uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ - \ - return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ - } - -/* Identical to the variance call except it takes an additional parameter, sum, - * and returns that value using pass-by-reference instead of returning - * sse - sum^2 / w*h - */ -#define GET_VAR(W, H) \ - void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ - } - -/* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in - * variable. - */ -#define MSE(W, H) \ - uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ - } - -/* All three forms of the variance are available in the same sizes. 
*/ -#define VARIANCES(W, H) \ - VAR(W, H) \ - SUBPIX_VAR(W, H) \ - SUBPIX_AVG_VAR(W, H) - -VARIANCES(128, 128) -VARIANCES(128, 64) -VARIANCES(64, 128) -VARIANCES(64, 64) -VARIANCES(64, 32) -VARIANCES(32, 64) -VARIANCES(32, 32) -VARIANCES(32, 16) -VARIANCES(16, 32) -VARIANCES(16, 16) -VARIANCES(16, 8) -VARIANCES(8, 16) -VARIANCES(8, 8) -VARIANCES(8, 4) -VARIANCES(4, 8) -VARIANCES(4, 4) -VARIANCES(4, 2) -VARIANCES(2, 4) -VARIANCES(2, 2) -VARIANCES(4, 16) -VARIANCES(16, 4) -VARIANCES(8, 32) -VARIANCES(32, 8) -VARIANCES(16, 64) -VARIANCES(64, 16) - -GET_VAR(16, 16) -GET_VAR(8, 8) - -MSE(16, 16) -MSE(16, 8) -MSE(8, 16) -MSE(8, 8) - -void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride) { - int i, j; - - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} - -// Get pred block from up-sampled reference. -void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, - int mi_row, int mi_col, const MV *const mv, - uint8_t *comp_pred, int width, int height, - int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, int subpel_search) { - // expect xd == NULL only in tests - if (xd != NULL) { - const MB_MODE_INFO *mi = xd->mi[0]; - const int ref_num = 0; - const int is_intrabc = is_intrabc_block(mi); - const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; - const int is_scaled = av1_is_scaled(sf); - - if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. 
- const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); - const struct buf_2d *const dst_buf = &pd->dst; - const struct buf_2d *const pre_buf = - is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - - // Calculate subpel_x/y and x/y_step. - const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. - const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. - const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. 
- ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = - av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. - const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); - - return; - } - } - - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - - if (!subpel_x_q3 && !subpel_y_q3) { - for (int i = 0; i < height; i++) { - memcpy(comp_pred, ref, width * sizeof(*comp_pred)); - comp_pred += width; - ref += ref_stride; - } - } else if (!subpel_y_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, - -1, width, height); - } else if (!subpel_x_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, - 16, width, height); - } else { - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - const int16_t *const kernel_x = - av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - const int16_t *const kernel_y = - av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, - width, intermediate_height); - aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), - 
MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, - width, height); - } -} - -void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, - int mi_row, int mi_col, const MV *const mv, - uint8_t *comp_pred, const uint8_t *pred, - int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, - int ref_stride, int subpel_search) { - int i, j; - - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); - } - comp_pred += width; - pred += width; - } -} - -void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { - int i, j; - const int fwd_offset = jcp_param->fwd_offset; - const int bck_offset = jcp_param->bck_offset; - - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; - tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); - comp_pred[j] = (uint8_t)tmp; - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} - -void aom_jnt_comp_avg_upsampled_pred_c( - MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { - int i, j; - const int fwd_offset = jcp_param->fwd_offset; - const int bck_offset = jcp_param->bck_offset; - - aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, - subpel_search); - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; - tmp = ROUND_POWER_OF_TWO(tmp, 
DIST_PRECISION_BITS); - comp_pred[j] = (uint8_t)tmp; - } - comp_pred += width; - pred += width; - } -} - -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, int64_t *sum) { - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); - int64_t tsum = 0; - uint64_t tsse = 0; - for (int i = 0; i < h; ++i) { - int32_t lsum = 0; - for (int j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; - lsum += diff; - tsse += (uint32_t)(diff * diff); - } - tsum += lsum; - a += a_stride; - b += b_stride; - } - *sum = tsum; - *sse = tsse; -} - -uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, int w, int h) { - uint64_t sse; - int64_t sum; - highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum); - return sse; -} - -static void highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (uint32_t)sse_long; - *sum = (int)sum_long; -} - -static void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); -} - -static void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); -} - -#define HIGHBD_VAR(W, H) \ - uint32_t 
aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } - -#define HIGHBD_GET_VAR(S) \ - void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } - -#define HIGHBD_MSE(W, H) \ - uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t 
aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } - -void aom_highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, uint16_t *output_ptr, - unsigned int src_pixels_per_line, int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - unsigned int i, j; - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; ++j) { - output_ptr[j] = ROUND_POWER_OF_TWO( - (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], - FILTER_BITS); - - ++src_ptr; - } - - // Next row... 
- src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -void aom_highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, uint16_t *output_ptr, - unsigned int src_pixels_per_line, unsigned int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - unsigned int i, j; - - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; ++j) { - output_ptr[j] = ROUND_POWER_OF_TWO( - (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], - FILTER_BITS); - ++src_ptr; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int 
src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } - -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, 
\ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - 
const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } - -/* All three forms of the variance are available in the same sizes. 
*/ -#define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) - -HIGHBD_VARIANCES(128, 128) -HIGHBD_VARIANCES(128, 64) -HIGHBD_VARIANCES(64, 128) -HIGHBD_VARIANCES(64, 64) -HIGHBD_VARIANCES(64, 32) -HIGHBD_VARIANCES(32, 64) -HIGHBD_VARIANCES(32, 32) -HIGHBD_VARIANCES(32, 16) -HIGHBD_VARIANCES(16, 32) -HIGHBD_VARIANCES(16, 16) -HIGHBD_VARIANCES(16, 8) -HIGHBD_VARIANCES(8, 16) -HIGHBD_VARIANCES(8, 8) -HIGHBD_VARIANCES(8, 4) -HIGHBD_VARIANCES(4, 8) -HIGHBD_VARIANCES(4, 4) -HIGHBD_VARIANCES(4, 2) -HIGHBD_VARIANCES(2, 4) -HIGHBD_VARIANCES(2, 2) -HIGHBD_VARIANCES(4, 16) -HIGHBD_VARIANCES(16, 4) -HIGHBD_VARIANCES(8, 32) -HIGHBD_VARIANCES(32, 8) -HIGHBD_VARIANCES(16, 64) -HIGHBD_VARIANCES(64, 16) - -HIGHBD_GET_VAR(8) -HIGHBD_GET_VAR(16) - -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) - -void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride) { - int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} - -void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, - const struct AV1Common *const cm, int mi_row, - int mi_col, const MV *const mv, - uint8_t *comp_pred8, int width, int height, - int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref8, int ref_stride, int bd, - int subpel_search) { - // expect xd == NULL only in tests - if (xd != NULL) { - const MB_MODE_INFO *mi = xd->mi[0]; - const int ref_num = 0; - const int is_intrabc = is_intrabc_block(mi); - const struct scale_factors *const sf = - is_intrabc ? 
&cm->sf_identity : &xd->block_refs[ref_num]->sf; - const int is_scaled = av1_is_scaled(sf); - - if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); - const struct buf_2d *const dst_buf = &pd->dst; - const struct buf_2d *const pre_buf = - is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - - // Calculate subpel_x/y and x/y_step. - const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. - const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. 
- const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = - av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. - const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); - - return; - } - } - - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - - if (!subpel_x_q3 && !subpel_y_q3) { - const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - for (int i = 0; i < height; i++) { - memcpy(comp_pred, ref, width * sizeof(*comp_pred)); - comp_pred += width; - ref += ref_stride; - } - } else if (!subpel_y_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, - NULL, -1, width, height, bd); - } else if (!subpel_x_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, - kernel, 16, width, height, bd); - } else { - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); - const int16_t *const kernel_x = - av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - const int16_t *const kernel_y = - 
av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height, bd); - aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, - bd); - } -} - -void aom_highbd_comp_avg_upsampled_pred_c( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, int subpel_search) { - int i, j; - - const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, - height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd, subpel_search); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); - } - comp_pred += width; - pred += width; - } -} - -void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { - int i, j; - const int fwd_offset = jcp_param->fwd_offset; - const int bck_offset = jcp_param->bck_offset; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; - tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); - comp_pred[j] = 
(uint16_t)tmp; - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} - -void aom_highbd_jnt_comp_avg_upsampled_pred_c( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, - int subpel_search) { - int i, j; - const int fwd_offset = jcp_param->fwd_offset; - const int bck_offset = jcp_param->bck_offset; - const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, - height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd, subpel_search); - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; - tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); - comp_pred[j] = (uint16_t)tmp; - } - comp_pred += width; - pred += width; - } -} - -void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride, - const uint8_t *mask, int mask_stride, - int invert_mask) { - int i, j; - const uint8_t *src0 = invert_mask ? pred : ref; - const uint8_t *src1 = invert_mask ? ref : pred; - const int stride0 = invert_mask ? width : ref_stride; - const int stride1 = invert_mask ? 
ref_stride : width; - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]); - } - comp_pred += width; - src0 += stride0; - src1 += stride1; - mask += mask_stride; - } -} - -void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, - int mi_row, int mi_col, const MV *const mv, - uint8_t *comp_pred, const uint8_t *pred, - int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask, - int subpel_search) { - if (subpel_x_q3 | subpel_y_q3) { - aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, - subpel_search); - ref = comp_pred; - ref_stride = width; - } - aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask, - mask_stride, invert_mask); -} - -#define MASK_SUBPIX_VAR(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ - invert_mask); \ - return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ - } - -MASK_SUBPIX_VAR(4, 4) -MASK_SUBPIX_VAR(4, 8) -MASK_SUBPIX_VAR(8, 4) -MASK_SUBPIX_VAR(8, 8) -MASK_SUBPIX_VAR(8, 16) -MASK_SUBPIX_VAR(16, 8) -MASK_SUBPIX_VAR(16, 16) -MASK_SUBPIX_VAR(16, 32) -MASK_SUBPIX_VAR(32, 16) -MASK_SUBPIX_VAR(32, 32) 
-MASK_SUBPIX_VAR(32, 64) -MASK_SUBPIX_VAR(64, 32) -MASK_SUBPIX_VAR(64, 64) -MASK_SUBPIX_VAR(64, 128) -MASK_SUBPIX_VAR(128, 64) -MASK_SUBPIX_VAR(128, 128) -MASK_SUBPIX_VAR(4, 16) -MASK_SUBPIX_VAR(16, 4) -MASK_SUBPIX_VAR(8, 32) -MASK_SUBPIX_VAR(32, 8) -MASK_SUBPIX_VAR(16, 64) -MASK_SUBPIX_VAR(64, 16) - -void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { - int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - if (!invert_mask) - comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); - else - comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); - } - comp_pred += width; - pred += width; - ref += ref_stride; - mask += mask_stride; - } -} - -void aom_highbd_comp_mask_upsampled_pred( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int bd, int subpel_search) { - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, - height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd, subpel_search); - aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, - mask, mask_stride, invert_mask); -} - -#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ - unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - 
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ - invert_mask); \ - \ - return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - ref, ref_stride, sse); \ - } \ - \ - unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ - invert_mask); \ - \ - return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - ref, ref_stride, sse); \ - } \ - \ - unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - 
aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ - invert_mask); \ - \ - return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - ref, ref_stride, sse); \ - } - -HIGHBD_MASK_SUBPIX_VAR(4, 4) -HIGHBD_MASK_SUBPIX_VAR(4, 8) -HIGHBD_MASK_SUBPIX_VAR(8, 4) -HIGHBD_MASK_SUBPIX_VAR(8, 8) -HIGHBD_MASK_SUBPIX_VAR(8, 16) -HIGHBD_MASK_SUBPIX_VAR(16, 8) -HIGHBD_MASK_SUBPIX_VAR(16, 16) -HIGHBD_MASK_SUBPIX_VAR(16, 32) -HIGHBD_MASK_SUBPIX_VAR(32, 16) -HIGHBD_MASK_SUBPIX_VAR(32, 32) -HIGHBD_MASK_SUBPIX_VAR(32, 64) -HIGHBD_MASK_SUBPIX_VAR(64, 32) -HIGHBD_MASK_SUBPIX_VAR(64, 64) -HIGHBD_MASK_SUBPIX_VAR(64, 128) -HIGHBD_MASK_SUBPIX_VAR(128, 64) -HIGHBD_MASK_SUBPIX_VAR(128, 128) -HIGHBD_MASK_SUBPIX_VAR(4, 16) -HIGHBD_MASK_SUBPIX_VAR(16, 4) -HIGHBD_MASK_SUBPIX_VAR(8, 32) -HIGHBD_MASK_SUBPIX_VAR(32, 8) -HIGHBD_MASK_SUBPIX_VAR(16, 64) -HIGHBD_MASK_SUBPIX_VAR(64, 16) - -static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, - const int32_t *wsrc, const int32_t *mask, - int w, int h, unsigned int *sse, int *sum) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); - *sum += diff; - *sse += diff * diff; - } - - pre += pre_stride; - wsrc += w; - mask += w; - } -} - -#define OBMC_VAR(W, H) \ - unsigned int aom_obmc_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ - } - -#define OBMC_SUBPIX_VAR(W, H) \ - unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const 
int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ - } - -OBMC_VAR(4, 4) -OBMC_SUBPIX_VAR(4, 4) - -OBMC_VAR(4, 8) -OBMC_SUBPIX_VAR(4, 8) - -OBMC_VAR(8, 4) -OBMC_SUBPIX_VAR(8, 4) - -OBMC_VAR(8, 8) -OBMC_SUBPIX_VAR(8, 8) - -OBMC_VAR(8, 16) -OBMC_SUBPIX_VAR(8, 16) - -OBMC_VAR(16, 8) -OBMC_SUBPIX_VAR(16, 8) - -OBMC_VAR(16, 16) -OBMC_SUBPIX_VAR(16, 16) - -OBMC_VAR(16, 32) -OBMC_SUBPIX_VAR(16, 32) - -OBMC_VAR(32, 16) -OBMC_SUBPIX_VAR(32, 16) - -OBMC_VAR(32, 32) -OBMC_SUBPIX_VAR(32, 32) - -OBMC_VAR(32, 64) -OBMC_SUBPIX_VAR(32, 64) - -OBMC_VAR(64, 32) -OBMC_SUBPIX_VAR(64, 32) - -OBMC_VAR(64, 64) -OBMC_SUBPIX_VAR(64, 64) - -OBMC_VAR(64, 128) -OBMC_SUBPIX_VAR(64, 128) - -OBMC_VAR(128, 64) -OBMC_SUBPIX_VAR(128, 64) - -OBMC_VAR(128, 128) -OBMC_SUBPIX_VAR(128, 128) - -OBMC_VAR(4, 16) -OBMC_SUBPIX_VAR(4, 16) -OBMC_VAR(16, 4) -OBMC_SUBPIX_VAR(16, 4) -OBMC_VAR(8, 32) -OBMC_SUBPIX_VAR(8, 32) -OBMC_VAR(32, 8) -OBMC_SUBPIX_VAR(32, 8) -OBMC_VAR(16, 64) -OBMC_SUBPIX_VAR(16, 64) -OBMC_VAR(64, 16) -OBMC_SUBPIX_VAR(64, 16) - -static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, int w, int h, - uint64_t *sse, int64_t *sum) { - int i, j; - uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); - *sum += diff; - *sse += diff * diff; - } - - pre += pre_stride; - wsrc += w; - mask += w; - } -} - -static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, 
int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64; - uint64_t sse64; - highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); - *sum = (int)sum64; - *sse = (unsigned int)sse64; -} - -static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64; - uint64_t sse64; - highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); - *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); -} - -static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64; - uint64_t sse64; - highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); - *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); -} - -#define HIGHBD_OBMC_VAR(W, H) \ - unsigned int aom_highbd_obmc_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - \ - unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } - -#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ - unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - wsrc, mask, sse); \ - } \ - \ - unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ - } \ - \ - unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - 
aom_highbd_var_filter_block2d_bil_first_pass( \ - pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ - } - -HIGHBD_OBMC_VAR(4, 4) -HIGHBD_OBMC_SUBPIX_VAR(4, 4) - -HIGHBD_OBMC_VAR(4, 8) -HIGHBD_OBMC_SUBPIX_VAR(4, 8) - -HIGHBD_OBMC_VAR(8, 4) -HIGHBD_OBMC_SUBPIX_VAR(8, 4) - -HIGHBD_OBMC_VAR(8, 8) -HIGHBD_OBMC_SUBPIX_VAR(8, 8) - -HIGHBD_OBMC_VAR(8, 16) -HIGHBD_OBMC_SUBPIX_VAR(8, 16) - -HIGHBD_OBMC_VAR(16, 8) -HIGHBD_OBMC_SUBPIX_VAR(16, 8) - -HIGHBD_OBMC_VAR(16, 16) -HIGHBD_OBMC_SUBPIX_VAR(16, 16) - -HIGHBD_OBMC_VAR(16, 32) -HIGHBD_OBMC_SUBPIX_VAR(16, 32) - -HIGHBD_OBMC_VAR(32, 16) -HIGHBD_OBMC_SUBPIX_VAR(32, 16) - -HIGHBD_OBMC_VAR(32, 32) -HIGHBD_OBMC_SUBPIX_VAR(32, 32) - -HIGHBD_OBMC_VAR(32, 64) -HIGHBD_OBMC_SUBPIX_VAR(32, 64) - -HIGHBD_OBMC_VAR(64, 32) -HIGHBD_OBMC_SUBPIX_VAR(64, 32) - -HIGHBD_OBMC_VAR(64, 64) -HIGHBD_OBMC_SUBPIX_VAR(64, 64) - -HIGHBD_OBMC_VAR(64, 128) -HIGHBD_OBMC_SUBPIX_VAR(64, 128) - -HIGHBD_OBMC_VAR(128, 64) -HIGHBD_OBMC_SUBPIX_VAR(128, 64) - -HIGHBD_OBMC_VAR(128, 128) -HIGHBD_OBMC_SUBPIX_VAR(128, 128) - -HIGHBD_OBMC_VAR(4, 16) -HIGHBD_OBMC_SUBPIX_VAR(4, 16) -HIGHBD_OBMC_VAR(16, 4) -HIGHBD_OBMC_SUBPIX_VAR(16, 4) -HIGHBD_OBMC_VAR(8, 32) -HIGHBD_OBMC_SUBPIX_VAR(8, 32) -HIGHBD_OBMC_VAR(32, 8) -HIGHBD_OBMC_SUBPIX_VAR(32, 8) -HIGHBD_OBMC_VAR(16, 64) -HIGHBD_OBMC_SUBPIX_VAR(16, 64) -HIGHBD_OBMC_VAR(64, 16) -HIGHBD_OBMC_SUBPIX_VAR(64, 16) diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h deleted file mode 100644 index 362da29d3..000000000 --- a/third_party/aom/aom_dsp/variance.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_VARIANCE_H_ -#define AOM_AOM_DSP_VARIANCE_H_ - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define FILTER_BITS 7 -#define FILTER_WEIGHT 128 - -typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride); - -typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *second_pred); - -typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, - int b_stride, int n); - -typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *const b_array[], - int b_stride, unsigned int *sad_array); - -typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); - -typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); - -typedef unsigned int (*aom_subp_avg_variance_fn_t)( - const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, - int b_stride, unsigned int *sse, const uint8_t *second_pred); - -typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *second_pred, - const JNT_COMP_PARAMS *jcp_param); - -typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)( - const uint8_t *a, int a_stride, int 
xoffset, int yoffset, const uint8_t *b, - int b_stride, unsigned int *sse, const uint8_t *second_pred, - const JNT_COMP_PARAMS *jcp_param); - -typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred, - const uint8_t *msk, int msk_stride, - int invert_mask); -typedef unsigned int (*aom_masked_subpixvariance_fn_t)( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, - const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); - -void aom_highbd_comp_mask_upsampled_pred( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int bd, int subpel_search); - -typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, - const int32_t *wsrc, - const int32_t *msk); -typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, - int pred_stride, - const int32_t *wsrc, - const int32_t *msk, - unsigned int *sse); -typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( - const uint8_t *pred, int pred_stride, int xoffset, int yoffset, - const int32_t *wsrc, const int32_t *msk, unsigned int *sse); - -typedef struct aom_variance_vtable { - aom_sad_fn_t sdf; - aom_sad_avg_fn_t sdaf; - aom_variance_fn_t vf; - aom_subpixvariance_fn_t svf; - aom_subp_avg_variance_fn_t svaf; - aom_sad_multi_d_fn_t sdx4df; - aom_masked_sad_fn_t msdf; - aom_masked_subpixvariance_fn_t msvf; - aom_obmc_sad_fn_t osdf; - aom_obmc_variance_fn_t ovf; - aom_obmc_subpixvariance_fn_t osvf; - aom_jnt_sad_avg_fn_t jsdaf; - aom_jnt_subp_avg_variance_fn_t jsvaf; -} aom_variance_fn_ptr_t; - -void aom_highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, 
uint16_t *output_ptr, - unsigned int src_pixels_per_line, int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter); - -void aom_highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, uint16_t *output_ptr, - unsigned int src_pixels_per_line, unsigned int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter); - -uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h); - -uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, int w, int h); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_VARIANCE_H_ diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c deleted file mode 100644 index 5f5bf5f14..000000000 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction aom_filter_block1d16_v8_sse2; -filter8_1dfunction aom_filter_block1d16_h8_sse2; -filter8_1dfunction aom_filter_block1d8_v8_sse2; -filter8_1dfunction aom_filter_block1d8_h8_sse2; -filter8_1dfunction aom_filter_block1d4_v8_sse2; -filter8_1dfunction aom_filter_block1d4_h8_sse2; - -#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2 -#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2 -#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2 -#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2 -#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2 -#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2 - -filter8_1dfunction aom_filter_block1d16_v2_sse2; -filter8_1dfunction aom_filter_block1d16_h2_sse2; -filter8_1dfunction aom_filter_block1d8_v2_sse2; -filter8_1dfunction aom_filter_block1d8_h2_sse2; -filter8_1dfunction aom_filter_block1d4_v2_sse2; -filter8_1dfunction aom_filter_block1d4_h2_sse2; - -// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); - -#if ARCH_X86_64 -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction 
aom_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; - -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; - -// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); - -#endif // ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm deleted file mode 100644 index 7283c32b8..000000000 --- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm +++ /dev/null @@ -1,297 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro convolve_fn 1-2 -%ifidn %1, avg -%define AUX_XMM_REGS 4 -%else -%define AUX_XMM_REGS 0 -%endif -%ifidn %2, highbd -%define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd -%else -%define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h -%endif - mov r4d, dword wm -%ifidn %2, highbd - shl r4d, 1 - shl srcq, 1 - shl src_strideq, 1 - shl dstq, 1 - shl dst_strideq, 1 -%else - cmp r4d, 4 - je .w4 -%endif - cmp r4d, 8 - je .w8 - cmp r4d, 16 - je .w16 - cmp r4d, 32 - je .w32 - - cmp r4d, 64 - je .w64 -%ifidn %2, highbd - cmp r4d, 128 - je .w128 - -.w256: - mov r4d, dword hm -.loop256: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - movu m0, [srcq+64] - movu m1, [srcq+80] - movu m2, [srcq+96] - movu m3, [srcq+112] -%ifidn %1, avg - pavg m0, [dstq+64] - pavg m1, [dstq+80] - pavg m2, [dstq+96] - pavg m3, [dstq+112] -%endif - mova [dstq+64], m0 - mova [dstq+80], m1 - mova [dstq+96], m2 - mova [dstq+112], m3 - movu m0, [srcq+128] - movu m1, [srcq+128+16] - movu m2, [srcq+128+32] - movu m3, [srcq+128+48] -%ifidn %1, avg - pavg m0, [dstq+128] - pavg m1, [dstq+128+16] - pavg m2, [dstq+128+32] - pavg m3, [dstq+128+48] -%endif - mova [dstq+128 ], m0 - mova [dstq+128+16], m1 - mova [dstq+128+32], m2 - mova [dstq+128+48], m3 - movu m0, [srcq+128+64] - movu m1, [srcq+128+80] - movu m2, [srcq+128+96] - movu m3, [srcq+128+112] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq+128+64] - pavg m1, [dstq+128+80] - pavg m2, [dstq+128+96] - pavg m3, [dstq+128+112] -%endif - mova [dstq+128+64], m0 - mova [dstq+128+80], m1 - mova [dstq+128+96], 
m2 - mova [dstq+128+112], m3 - add dstq, dst_strideq - sub r4d, 1 - jnz .loop256 - RET -%endif - -.w128: - mov r4d, dword hm -.loop128: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - movu m0, [srcq+64] - movu m1, [srcq+80] - movu m2, [srcq+96] - movu m3, [srcq+112] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq+64] - pavg m1, [dstq+80] - pavg m2, [dstq+96] - pavg m3, [dstq+112] -%endif - mova [dstq+64], m0 - mova [dstq+80], m1 - mova [dstq+96], m2 - mova [dstq+112], m3 - add dstq, dst_strideq - sub r4d, 1 - jnz .loop128 - RET - -.w64: - mov r4d, dword hm -.loop64: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - add dstq, dst_strideq - sub r4d, 1 - jnz .loop64 - RET - -.w32: - mov r4d, dword hm -.loop32: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+src_strideq] - movu m3, [srcq+src_strideq+16] - lea srcq, [srcq+src_strideq*2] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq +16] - pavg m2, [dstq+dst_strideq] - pavg m3, [dstq+dst_strideq+16] -%endif - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+dst_strideq ], m2 - mova [dstq+dst_strideq+16], m3 - lea dstq, [dstq+dst_strideq*2] - sub r4d, 2 - jnz .loop32 - RET - -.w16: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop16: - movu m0, [srcq] - movu m1, [srcq+src_strideq] - movu m2, [srcq+src_strideq*2] - movu m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+dst_strideq] - pavg m2, [dstq+dst_strideq*2] - pavg m3, [dstq+r6q] -%endif - mova [dstq ], m0 - 
mova [dstq+dst_strideq ], m1 - mova [dstq+dst_strideq*2], m2 - mova [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop16 - RET - -.w8: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop8: - movh m0, [srcq] - movh m1, [srcq+src_strideq] - movh m2, [srcq+src_strideq*2] - movh m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movh m4, [dstq] - movh m5, [dstq+dst_strideq] - movh m6, [dstq+dst_strideq*2] - movh m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movh [dstq ], m0 - movh [dstq+dst_strideq ], m1 - movh [dstq+dst_strideq*2], m2 - movh [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop8 - RET - -%ifnidn %2, highbd -.w4: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop4: - movd m0, [srcq] - movd m1, [srcq+src_strideq] - movd m2, [srcq+src_strideq*2] - movd m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movd m4, [dstq] - movd m5, [dstq+dst_strideq] - movd m6, [dstq+dst_strideq*2] - movd m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movd [dstq ], m0 - movd [dstq+dst_strideq ], m1 - movd [dstq+dst_strideq*2], m2 - movd [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop4 - RET -%endif -%endmacro - -INIT_XMM sse2 -convolve_fn copy -convolve_fn avg -convolve_fn copy, highbd diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm deleted file mode 100644 index b6f040791..000000000 --- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +++ /dev/null @@ -1,613 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. - -%macro HIGH_GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x00000040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm6 - punpcklwd xmm2, xmm5 - punpcklwd xmm3, xmm4 - punpcklwd xmm1, xmm7 - - movdqa k0k6, xmm0 - movdqa k2k5, xmm2 - movdqa k3k4, xmm3 - movdqa k1k7, xmm1 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - ;Compute max and min values of a pixel - mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps - movq xmm0, rdx - movq xmm1, rcx - pshufd xmm0, xmm0, 0b - movdqa xmm2, xmm0 - psllw xmm0, xmm1 - psubw xmm0, xmm2 - pxor xmm1, xmm1 - movdqa max, xmm0 ;max value (for clamping) - movdqa min, xmm1 ;min value (for clamping) - -%endm - -%macro HIGH_APPLY_FILTER_4 1 - punpcklwd xmm0, xmm6 ;two row in one register - punpcklwd xmm1, xmm7 - punpcklwd xmm2, xmm5 - punpcklwd xmm3, xmm4 - - pmaddwd xmm0, k0k6 ;multiply the filter factors - pmaddwd xmm1, k1k7 - pmaddwd xmm2, k2k5 - pmaddwd xmm3, k3k4 - - paddd xmm0, xmm1 ;sum - paddd xmm0, xmm2 - paddd xmm0, xmm3 - - paddd xmm0, krd ;rounding - psrad xmm0, 7 ;shift - packssdw xmm0, xmm0 ;pack to word - - ;clamp the values - pminsw xmm0, max - pmaxsw xmm0, min - -%if %1 - movq xmm1, [rdi] - pavgw xmm0, xmm1 -%endif - movq [rdi], 
xmm0 -%endm - -%macro HIGH_GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x00000040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - punpcklwd xmm0, xmm1 - punpckhwd xmm6, xmm7 - punpckhwd xmm2, xmm5 - punpckhwd xmm3, xmm4 - - movdqa k0k1, xmm0 ;store filter factors on stack - movdqa k6k7, xmm6 - movdqa k2k5, xmm2 - movdqa k3k4, xmm3 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - ;Compute max and min values of a pixel - mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps - movq xmm0, rdx - movq xmm1, rcx - pshufd xmm0, xmm0, 0b - movdqa xmm2, xmm0 - psllw xmm0, xmm1 - psubw xmm0, xmm2 - pxor xmm1, xmm1 - movdqa max, xmm0 ;max value (for clamping) - movdqa min, xmm1 ;min value (for clamping) -%endm - -%macro LOAD_VERT_8 1 - movdqu xmm0, [rsi + %1] ;0 - movdqu xmm1, [rsi + rax + %1] ;1 - movdqu xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movdqu xmm7, [rsi + rdx * 2 + %1] ;7 - movdqu xmm2, [rsi + rax + %1] ;2 - movdqu xmm3, [rsi + rax * 2 + %1] ;3 - movdqu xmm4, [rsi + rdx + %1] ;4 - movdqu xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro HIGH_APPLY_FILTER_8 2 - movdqu temp, xmm4 - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm1 - punpckhwd xmm4, xmm1 - movdqa xmm1, xmm6 - punpcklwd xmm6, xmm7 - punpckhwd xmm1, xmm7 - movdqa xmm7, xmm2 - punpcklwd xmm2, xmm5 - punpckhwd xmm7, xmm5 - - movdqu xmm5, temp - movdqu temp, xmm4 - movdqa xmm4, xmm3 - punpcklwd xmm3, xmm5 - punpckhwd xmm4, xmm5 - movdqu xmm5, temp - - pmaddwd xmm0, k0k1 - pmaddwd xmm5, k0k1 - pmaddwd xmm6, k6k7 - pmaddwd xmm1, k6k7 - pmaddwd xmm2, k2k5 - pmaddwd xmm7, k2k5 - pmaddwd xmm3, k3k4 - pmaddwd 
xmm4, k3k4 - - paddd xmm0, xmm6 - paddd xmm0, xmm2 - paddd xmm0, xmm3 - paddd xmm5, xmm1 - paddd xmm5, xmm7 - paddd xmm5, xmm4 - - paddd xmm0, krd ;rounding - paddd xmm5, krd - psrad xmm0, 7 ;shift - psrad xmm5, 7 - packssdw xmm0, xmm5 ;pack back to word - - ;clamp the values - pminsw xmm0, max - pmaxsw xmm0, min - -%if %1 - movdqu xmm1, [rdi + %2] - pavgw xmm0, xmm1 -%endif - movdqu [rdi + %2], xmm0 -%endm - -SECTION .text - -;void aom_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 7 - %define k0k6 [rsp + 16 * 0] - %define k2k5 [rsp + 16 * 1] - %define k3k4 [rsp + 16 * 2] - %define k1k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define max [rsp + 16 * 5] - %define min [rsp + 16 * 6] - - HIGH_GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movq xmm0, [rsi] ;load src: row 0 - movq xmm1, [rsi + rax] ;1 - movq xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2] ;7 - movq xmm2, [rsi + rax] ;2 - movq xmm3, [rsi + rax * 2] ;3 - movq xmm4, [rsi + rdx] ;4 - movq xmm5, [rsi + rax * 4] ;5 - - HIGH_APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 7 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char 
*output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - HIGH_APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - 
-.loop: - LOAD_VERT_8 0 - HIGH_APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 16 - HIGH_APPLY_FILTER_8 0, 16 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 7 - %define k0k6 [rsp + 16 * 0] - %define k2k5 [rsp + 16 * 1] - %define k3k4 [rsp + 16 * 2] - %define k1k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define max [rsp + 16 * 5] - %define min [rsp + 16 * 6] - - HIGH_GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm4, [rsi + 2] - movdqa xmm1, xmm0 - movdqa xmm6, xmm4 - movdqa xmm7, xmm4 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm4 - - psrldq xmm1, 2 - psrldq xmm6, 4 - psrldq xmm7, 6 - psrldq xmm2, 4 - psrldq xmm3, 6 - psrldq xmm5, 2 - - HIGH_APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 7 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE 
-sym(aom_highbd_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm1, [rsi - 4] - movdqu xmm2, [rsi - 2] - movdqu xmm3, [rsi] - movdqu xmm4, [rsi + 2] - movdqu xmm5, [rsi + 4] - movdqu xmm6, [rsi + 6] - movdqu xmm7, [rsi + 8] - - HIGH_APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - 
-.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm1, [rsi - 4] - movdqu xmm2, [rsi - 2] - movdqu xmm3, [rsi] - movdqu xmm4, [rsi + 2] - movdqu xmm5, [rsi + 4] - movdqu xmm6, [rsi + 6] - movdqu xmm7, [rsi + 8] - - HIGH_APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 10] ;load src - movdqu xmm1, [rsi + 12] - movdqu xmm2, [rsi + 14] - movdqu xmm3, [rsi + 16] - movdqu xmm4, [rsi + 18] - movdqu xmm5, [rsi + 20] - movdqu xmm6, [rsi + 22] - movdqu xmm7, [rsi + 24] - - HIGH_APPLY_FILTER_8 0, 16 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm deleted file mode 100644 index 7b3fe6419..000000000 --- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +++ /dev/null @@ -1,338 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro HIGH_GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x00000040 - - movdqa xmm3, [rdx] ;load filters - pshuflw xmm4, xmm3, 11111111b ;k3 - psrldq xmm3, 8 - pshuflw xmm3, xmm3, 0b ;k4 - punpcklwd xmm4, xmm3 ;k3k4 - - movq xmm3, rcx ;rounding - pshufd xmm3, xmm3, 0 - - mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps - movq xmm5, rdx - movq xmm2, rcx - pshufd xmm5, xmm5, 0b - movdqa xmm1, xmm5 - psllw xmm5, xmm2 - psubw xmm5, xmm1 ;max value (for clamping) - pxor xmm2, xmm2 ;min value (for clamping) - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro HIGH_APPLY_FILTER_4 1 - - punpcklwd xmm0, xmm1 ;two row in one register - pmaddwd xmm0, xmm4 ;multiply the filter factors - - paddd xmm0, xmm3 ;rounding - psrad xmm0, 7 ;shift - packssdw xmm0, xmm0 ;pack to word - - ;clamp the values - pminsw xmm0, xmm5 - pmaxsw xmm0, xmm2 - -%if %1 - movq xmm1, [rdi] - pavgw xmm0, xmm1 -%endif - - movq [rdi], xmm0 - lea rsi, [rsi + 2*rax] - lea rdi, [rdi + 2*rdx] - dec rcx -%endm - -%if ARCH_X86_64 -%macro HIGH_GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x00000040 - - movdqa xmm6, [rdx] ;load filters - - pshuflw xmm7, xmm6, 11111111b ;k3 - pshufhw xmm6, xmm6, 0b ;k4 - psrldq xmm6, 8 - punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 - - movq xmm4, rcx ;rounding - pshufd xmm4, xmm4, 0 - - mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps - movq xmm8, rdx - movq xmm5, rcx - pshufd xmm8, xmm8, 0b - movdqa xmm1, xmm8 - psllw xmm8, xmm5 - psubw xmm8, xmm1 ;max value (for clamping) - pxor xmm5, xmm5 ;min value (for clamping) - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro HIGH_APPLY_FILTER_8 1 
- movdqa xmm6, xmm0 - punpckhwd xmm6, xmm1 - punpcklwd xmm0, xmm1 - pmaddwd xmm6, xmm7 - pmaddwd xmm0, xmm7 - - paddd xmm6, xmm4 ;rounding - paddd xmm0, xmm4 ;rounding - psrad xmm6, 7 ;shift - psrad xmm0, 7 ;shift - packssdw xmm0, xmm6 ;pack back to word - - ;clamp the values - pminsw xmm0, xmm8 - pmaxsw xmm0, xmm5 - -%if %1 - movdqu xmm1, [rdi] - pavgw xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + 2*rax] - lea rdi, [rdi + 2*rdx] - dec rcx -%endm - -%macro HIGH_APPLY_FILTER_16 1 - movdqa xmm9, xmm0 - movdqa xmm6, xmm2 - punpckhwd xmm9, xmm1 - punpckhwd xmm6, xmm3 - punpcklwd xmm0, xmm1 - punpcklwd xmm2, xmm3 - - pmaddwd xmm9, xmm7 - pmaddwd xmm6, xmm7 - pmaddwd xmm0, xmm7 - pmaddwd xmm2, xmm7 - - paddd xmm9, xmm4 ;rounding - paddd xmm6, xmm4 - paddd xmm0, xmm4 - paddd xmm2, xmm4 - - psrad xmm9, 7 ;shift - psrad xmm6, 7 - psrad xmm0, 7 - psrad xmm2, 7 - - packssdw xmm0, xmm9 ;pack back to word - packssdw xmm2, xmm6 ;pack back to word - - ;clamp the values - pminsw xmm0, xmm8 - pmaxsw xmm0, xmm5 - pminsw xmm2, xmm8 - pmaxsw xmm2, xmm5 - -%if %1 - movdqu xmm1, [rdi] - movdqu xmm3, [rdi + 16] - pavgw xmm0, xmm1 - pavgw xmm2, xmm3 -%endif - movdqu [rdi], xmm0 ;store the result - movdqu [rdi + 16], xmm2 ;store the result - - lea rsi, [rsi + 2*rax] - lea rdi, [rdi + 2*rdx] - dec rcx -%endm -%endif - -SECTION .text - -global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM_4 -.loop: - movq xmm0, [rsi] ;load src - movq xmm1, [rsi + 2*rax] - - HIGH_APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -%if ARCH_X86_64 -global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 8 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu 
xmm0, [rsi] ;0 - movdqu xmm1, [rsi + 2*rax] ;1 - - HIGH_APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 9 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm2, [rsi + 16] - movdqu xmm1, [rsi + 2*rax] ;1 - movdqu xmm3, [rsi + 2*rax + 16] - - HIGH_APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%endif - -global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 2 - - HIGH_APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -%if ARCH_X86_64 -global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 8 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 2] - - HIGH_APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 9 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 2] - movdqu xmm2, [rsi + 16] - movdqu xmm3, [rsi + 18] - - HIGH_APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%endif diff --git 
a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c deleted file mode 100644 index 94b5da171..000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ /dev/null @@ -1,1441 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve.h" -#include "aom_dsp/x86/convolve_avx2.h" -#include "aom_ports/mem.h" - -/* MM256_BROADCASTSI128_SI256(x) copies a 128 bit value into both halves of a - * 256 bit register. The conditionals below only choose how the intrinsic is - * spelled: the compiler versions they match get _mm_broadcastsi128_si256 - * (taking a pointer, or for gcc 4.7 a value); every other compiler gets - * _mm256_broadcastsi128_si256(x). */ -#if defined(__clang__) -#if (__clang_major__ > 0 && __clang_major__ < 3) || \ - (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#else // clang > 3.3, and not 5.0 on macosx.
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // clang <= 3.3 -#elif defined(__GNUC__) -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -#else // gcc > 4.7 -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // gcc <= 4.6 -#else // !(gcc || clang) -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -// Stores the low 32 bits of each 128 bit half of *a: the low half goes to -// output_ptr and the high half to output_ptr + stride. -// NOTE(review): output_ptr is declared const but is written through a cast, -// and the store goes through a uint32_t pointer - confirm callers tolerate -// that access. -static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, - const ptrdiff_t stride, const __m256i *a) { - *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); - *((uint32_t *)(output_ptr + stride)) = - _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); -} - -// Loads 8 bytes from lo into the low 128 bit half of the result and 8 bytes -// from hi into the high half. -static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { - __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); - a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); - return a; -} - -// Stores the low 8 bytes of each 128 bit half of *a: the low half goes to -// output_ptr and the high half to output_ptr + stride. -// NOTE(review): output_ptr is declared const but is written through a cast. -static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, - const ptrdiff_t stride, const __m256i *a) { - _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); - _mm_storel_epi64((__m128i *)(output_ptr + stride), - _mm256_extractf128_si256(*a, 1)); -} - -// Unaligned 16 byte loads: lo fills the low 128 bit half of the result and hi -// the high half. -static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { - __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); - a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); - return a; -} - -// Stores each 128 bit half of *a with _mm_store_si128 (the aligned store), so -// output_ptr and output_ptr + stride must both be 16 byte aligned. -// NOTE(review): output_ptr is declared const but is written through a cast. -static INLINE void xx_store2_mi128(const uint8_t *output_ptr, - const ptrdiff_t stride, const __m256i *a) { - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); - _mm_store_si128((__m128i *)(output_ptr + stride), - _mm256_extractf128_si256(*a, 1)); -} - -// Horizontal filter for a block 4 pixels wide that uses only taps 2..5 of the -// 8 entry filter. Reads start 3 bytes before src_ptr. The taps are halved and -// packed to 8 bit; results are rounded (+32), shifted right by 6 and saturated -// to 8 bit. Two rows are produced per iteration and an odd last row is handled -// separately; each row writes 4 bytes to output_ptr. -// NOTE(review): filt4_d4_global_avx2 is not defined in this view - its shuffle -// layout is assumed here, not shown. -static void aom_filter_block1d4_h4_avx2( - const uint8_t
*src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - src_ptr -= 3; - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - // pack the halved 16 bit (short) taps to 8 bit (byte); both 64 bit halves - // of the 128 bit register then hold the same 8 taps. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // replicate packed taps 2..5 into every 32 bits (byte indices 2,3,4,5) - firstFilters = - _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); - filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); - - // multiply the source and destination strides by two (two rows per pass) - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i -= 2) { - // load 16 bytes from each of two consecutive source rows - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); - - // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - - // multiply adjacent byte pairs with the taps and add each pair - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - - // add neighbouring pair sums so each 16 bit result covers 4 taps - srcRegFilt32b1_1 = - _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = - _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); - - src_ptr += src_stride; - - xx_storeu2_epi32(output_ptr,
output_pitch, &srcRegFilt32b1_1); - output_ptr += dst_stride; - } - - // if the number of rows is odd, filter the remaining row on its own - // (4 output bytes) - if (i > 0) { - __m128i srcReg1, srcRegFilt1_1; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - - // multiply adjacent byte pairs with the taps and add each pair - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - - // add neighbouring pair sums so each 16 bit result covers 4 taps - srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); - - // shrink to 8 bit each 16 bits; only the low 4 bytes are stored below - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); - - // save 4 bytes - *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); - } -} - -// Horizontal filter for a block 4 pixels wide that uses all 8 taps. Packed -// taps 0..3 and 4..7 are applied to two shuffled copies of the source, the two -// sets of pair sums are added and then horizontally added. Reads start 3 bytes -// before src_ptr; results are rounded (+32), shifted right by 6 and saturated -// to 8 bit. Two rows are produced per iteration and an odd last row is handled -// separately; each row writes 4 bytes to output_ptr. -// NOTE(review): filt_d4_global_avx2 is not defined in this view. -static void aom_filter_block1d4_h8_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg32, filt1Reg, filt2Reg; - __m256i firstFilters, secondFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2; - __m256i srcReg32b1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - src_ptr -= 3; - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - // pack the halved 16 bit (short) taps to 8 bit (byte) and have the same data - // in both halves of the 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 32 bits (packed taps 0..3) - firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); - // duplicate only the second 32 bits (packed taps 4..7) - secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); - - filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); - - // multiply the source and destination strides by two (two rows per pass) - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i -= 2) { - // load 16 bytes from each of two consecutive source rows - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); - - // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - - // multiply adjacent byte pairs with taps 0..3 and add each pair - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - - // filter the source buffer - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - - // multiply adjacent byte pairs with taps 4..7 and add each pair - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); - - // add and saturate the two sets of pair sums - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // add neighbouring sums so each 16 bit result covers all 8 taps - srcRegFilt32b1_1 = - _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = - _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); - - src_ptr += src_stride; - - xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); - output_ptr += dst_stride; - } - - //
if the number of strides is odd. - // process only 4 bytes (the single remaining row) - if (i > 0) { - __m128i srcReg1, srcRegFilt1_1; - __m128i srcRegFilt2; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - - // multiply adjacent byte pairs with taps 0..3 and add each pair - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - - // filter the source buffer - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - - // multiply adjacent byte pairs with taps 4..7 and add each pair - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); - - // add the two sets of pair sums, then add neighbouring sums so each - // 16 bit result covers all 8 taps - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); - - // shrink to 8 bit each 16 bits; only the low 4 bytes are stored below - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); - - // save 4 bytes - *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); - } -} - -// Horizontal filter for a block 8 pixels wide that uses only taps 2..5 of the -// 8 entry filter (packed bytes 2,3 and 4,5). Reads start 3 bytes before -// src_ptr; results are rounded (+32), shifted right by 6 and saturated to -// 8 bit. Two rows are produced per iteration and an odd last row is handled -// separately; each row writes 8 bytes to output_ptr. -// NOTE(review): filt_global_avx2 is not defined in this view. -static void aom_filter_block1d8_h4_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg32, filt2Reg, filt3Reg; - __m256i secondFilters, thirdFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, filtersReg32; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - src_ptr -= 3; - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - // converting the 16 bit
(short) to 8 bit (byte) and have the same data - // in both halves of the 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the second 16 bits (third and fourth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - - filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - - // multiply the source and destination strides by two (two rows per pass) - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i -= 2) { - // load 16 bytes from each of two consecutive source rows - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); - - src_ptr += src_stride; - - xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); - output_ptr += dst_stride; - } - - // if the number of rows is odd, one row is still left to filter.
- // process only 8 bytes (the single remaining row) - if (i > 0) { - __m128i srcReg1, srcRegFilt1_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); - - // filter the source buffer - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); - - // shrink to 8 bit each 16 bits - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); - - // save 8 bytes - _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); - } -} - -// Horizontal filter for a block 8 pixels wide that uses all 8 taps, applied -// as four tap pairs (packed bytes 0,1 / 2,3 / 4,5 / 6,7) to four shuffled -// copies of the source. Reads start 3 bytes before src_ptr; results are -// rounded (+32), shifted right by 6 and saturated to 8 bit. Two rows are -// produced per iteration and an odd last row is handled separately; each row -// writes 8 bytes to output_ptr. -// NOTE(review): filt_global_avx2 is not defined in this view. -static void aom_filter_block1d8_h8_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - src_ptr -= 3; - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - // pack the halved 16 bit (short) taps to 8 bit (byte) and have the same data - // in both halves of the 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and fourth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the fourth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - // multiply the source and destination strides by two (two rows per pass) - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i -= 2) { - // load 16 bytes from each of two consecutive source rows - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); - - // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer -
srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add the two middle tap pairs first, then fold them into the outer - // pairs; the adds saturate, so the grouping is part of the result - __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = - _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); - - src_ptr += src_stride; - - xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); - output_ptr += dst_stride; - } - - // if the number of rows is odd, one row is still left to filter.
- // process only 8 bytes (the single remaining row) - if (i > 0) { - __m128i srcReg1, srcRegFilt1_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); - - // shrink to 8 bit each 16 bits; the low 8 bytes hold the row's results - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); - - // save 8 bytes - _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); - } -} - -// Horizontal filter for a block 16 pixels wide that uses only taps 2..5 of the -// 8 entry filter (packed bytes 2,3 and 4,5). Each row is filtered as two -// halves loaded from src_ptr and src_ptr + 8 (after src_ptr has been moved -// back 3 bytes). Results are rounded (+32), shifted right by 6 and saturated -// to 8 bit. Two rows are produced per iteration and an odd last row is handled -// separately; each row writes 16 bytes through xx_store2_mi128. -// NOTE(review): filt_global_avx2 is not defined in this view. -static void aom_filter_block1d16_h4_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; -
__m256i addFilterReg32, filt2Reg, filt3Reg; - __m256i secondFilters, thirdFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - src_ptr -= 3; - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - // pack the halved 16 bit (short) taps to 8 bit (byte) and have the same data - // in both halves of the 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the second 16 bits (third and fourth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - - filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - - // multiply the source and destination strides by two (two rows per pass) - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i -= 2) { - // load 16 bytes from each of two consecutive source rows - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); - - // reading 2 strides of the next 16 bytes - //
(part of it was being read by earlier read) - srcReg32b2 = - xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - - src_ptr += src_stride; - - // each row is written with an aligned 16 byte store - xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); - output_ptr += dst_stride; - } - - // if the number of rows is odd, one row is still left to filter.
- // process only 16 bytes (the single remaining row) - if (i > 0) { - __m256i srcReg1, srcReg12; - __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; - - // 0x94 selects qwords 0,1,1,2: the low lane gets source bytes 0..15 and - // the high lane bytes 8..23. - // NOTE(review): the load itself reads 32 bytes from src_ptr although only - // the first 24 are used - confirm the source buffer allows that. - srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); - srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); - - // filter the source buffer - srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); - srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); - srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); - - // add and saturate the results together - srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); - srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); - - // shrink to 8 bit each 16 bits: each lane then holds its 8 results - // twice, and the permute (0x8 = qwords 0 and 2) gathers the 8 results of - // the low lane and the 8 of the high lane into the low 128 bits - srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); - srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); - - // save 16 bytes (aligned store) - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt1_1)); - } -} - -// Horizontal filter for a block 16 pixels wide that uses all 8 taps, applied -// as four tap pairs (packed bytes 0,1 / 2,3 / 4,5 / 6,7). Each row is filtered -// as two halves loaded from src_ptr and src_ptr + 8 (after src_ptr has been -// moved back 3 bytes). Results are rounded (+32), shifted right by 6 and -// saturated to 8 bit. Two rows are produced per iteration and an odd last row -// is handled separately; each row writes 16 bytes with aligned stores. -// NOTE(review): filt_global_avx2 is not defined in this view. -static void aom_filter_block1d16_h8_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - src_ptr -= 3; - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - // converting the 16 bit (short) to 8 bit (byte) and have
the same data - // in both halves of the 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and fourth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the fourth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - // multiply the source and destination strides by two (two rows per pass) - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i -= 2) { - // load 16 bytes from each of two consecutive source rows - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); - - // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); -
- // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add the two middle tap pairs first, then fold them into the outer - // pairs; the adds saturate, so the grouping is part of the result - __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); - - // reading 2 strides of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = - xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); - - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); - srcRegFilt32b2_1 =
_mm256_srai_epi16(srcRegFilt32b2_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - - src_ptr += src_stride; - - // each row is written with an aligned 16 byte store - xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); - output_ptr += dst_stride; - } - - // if the number of rows is odd, filter the remaining row on its own - // (16 output bytes) - if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); - - // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); -
srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = - _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); - - // round (+32) and shift each 16 bit value right by 6 - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); - - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); - - // shrink to 8 bit each 16 bits: the low 8 bytes come from the first - // load and the high 8 bytes from the load at src_ptr + 8 - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); - - // save 16 bytes (aligned store) - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); - } -} - -// Vertical filter for a block 8 pixels wide that uses only taps 2..5 of the -// 8 entry filter (packed bytes 2,3 and 4,5), applied to the rows at -// src_ptr + 2 * src_pitch through src_ptr + 6 * src_pitch. Each iteration -// loads two new rows and writes two output rows of 8 bytes; results are -// rounded (+32), shifted right by 6 and saturated to 8 bit. -// NOTE(review): the loop runs while i > 1 and there is no single-row tail, so -// an odd last row is never written - presumably callers pass an even -// output_height; confirm. -static void aom_filter_block1d8_v4_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i filtersReg32, addFilterReg32; - __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; - __m256i srcReg23_34_lo, srcReg45_56_lo; -
__m256i resReg23_34_lo, resReg45_56_lo; - __m256i resReglo, resReg; - __m256i secondFilters, thirdFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // halve the taps, then convert the 16 bit (short) to 8 bit (byte) and have - // the same data in both halves of the 128 bit register. - filtersReg = _mm_srai_epi16(filtersReg, 1); - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the second 16 bits (third and fourth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - - // multiply the source and destination strides by two (two rows per pass) - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - // rows 2 and 3 share one register (8 bytes each); row 4 is loaded alone - srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); - srcReg4x = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); - - // have consecutive loads on the same 256 register - srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); - - // interleave the bytes of rows 2,3 (low lane) and rows 3,4 (high lane) - srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); - - for (i = output_height; i > 1; i -= 2) { - // load the next two rows (8 bytes each) and have every two - // consecutive rows in the same 256 bit register - srcReg5x = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg45 = - _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); - - srcReg6x = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); - srcReg56 = - _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); - - // merge every two
consecutive registers - srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); - - // multiply 2 adjacent elements with the filter and add the result - resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); - resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); - - // add and saturate the results together - resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); - - // round (+32) and shift each 16 bit value right by 6 - resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); - resReglo = _mm256_srai_epi16(resReglo, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg = _mm256_packus_epi16(resReglo, resReglo); - - src_ptr += src_stride; - - xx_storeu2_epi64(output_ptr, out_pitch, &resReg); - - output_ptr += dst_stride; - - // keep the interleaved rows 4,5 / 5,6 and row 6 for the next pass, where - // they take the place of rows 2,3 / 3,4 and row 4 - srcReg23_34_lo = srcReg45_56_lo; - srcReg4x = srcReg6x; - } -} - -static void aom_filter_block1d8_v8_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg32; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register.
- filtersReg = _mm_srai_epi16(filtersReg, 1); - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - // multiple the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); - srcReg32b3 = - xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); - srcReg32b5 = - xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); - - // have each consecutive loads on the same 256 register - srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); - srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); - srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - for (i = output_height; i > 1; i -= 2) { - // load 
the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_adds_epi16(srcReg32b8, srcReg32b12)); - - // shift by 6 bit each 16 bit - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); - - src_ptr += src_stride; - - xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); - - output_ptr += dst_stride; - - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b11 = srcReg32b2; - srcReg32b2 = srcReg32b4; - srcReg32b7 = srcReg32b9; - } - if (i > 0) { - __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; - // load the 
last 16 bytes - srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); - - // shift by 6 bit each 16 bit - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); - - // save 8 bytes - _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); - } -} - -static void aom_filter_block1d16_v4_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i filtersReg32, addFilterReg32; - __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; - __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; - __m256i resReg23_34_lo, 
resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; - __m256i resReglo, resReghi, resReg; - __m256i secondFilters, thirdFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg = _mm_srai_epi16(filtersReg, 1); - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - - // multiple the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); - srcReg4x = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - - // have consecutive loads on the same 256 register - srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); - - srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); - srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); - - for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg5x = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg45 = - _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); - - srcReg6x = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - srcReg56 = - 
_mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); - - // merge every two consecutive registers - srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); - srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); - - // multiply 2 adjacent elements with the filter and add the result - resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); - resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); - - // add and saturate the results together - resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); - - // multiply 2 adjacent elements with the filter and add the result - resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); - resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); - - // add and saturate the results together - resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); - - // shift by 6 bit each 16 bit - resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); - resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); - resReglo = _mm256_srai_epi16(resReglo, 6); - resReghi = _mm256_srai_epi16(resReghi, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg = _mm256_packus_epi16(resReglo, resReghi); - - src_ptr += src_stride; - - xx_store2_mi128(output_ptr, out_pitch, &resReg); - - output_ptr += dst_stride; - - // save part of the registers for next strides - srcReg23_34_lo = srcReg45_56_lo; - srcReg23_34_hi = srcReg45_56_hi; - srcReg4x = srcReg6x; - } -} - -static void aom_filter_block1d16_v8_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg32; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, 
filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg = _mm_srai_epi16(filtersReg, 1); - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - // multiple the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); - srcReg32b3 = - xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); - srcReg32b5 = - xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - - // have each consecutive loads on the same 256 register - srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); - srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); - srcReg32b6 = 
_mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); - - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); - - for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_adds_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, 
firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_adds_epi16(srcReg32b8, srcReg32b12)); - - // shift by 6 bit each 16 bit - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); - - src_ptr += src_stride; - - xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); - - output_ptr += dst_stride; - - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; - } - if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; - // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, 
_mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = - _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); - - // shift by 6 bit each 16 bit - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); - } -} - -static void aom_filter_block1d4_v4_avx2( - const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - 
__m128i filtersReg; - __m256i filtersReg32, addFilterReg32; - __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; - __m256i srcReg23_34_lo, srcReg45_56_lo; - __m256i srcReg2345_3456_lo; - __m256i resReglo, resReg; - __m256i firstFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm256_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg = _mm_srai_epi16(filtersReg, 1); - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - firstFilters = - _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); - - // multiple the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); - srcReg4x = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); - - // have consecutive loads on the same 256 register - srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); - - srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); - - for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg5x = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg45 = - _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); - - srcReg6x = _mm256_castsi128_si256( - _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); - srcReg56 = - _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); - - // merge every two consecutive registers - srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); - - srcReg2345_3456_lo 
= _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); - - // multiply 2 adjacent elements with the filter and add the result - resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); - - resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); - - // shift by 6 bit each 16 bit - resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); - resReglo = _mm256_srai_epi16(resReglo, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg = _mm256_packus_epi16(resReglo, resReglo); - - src_ptr += src_stride; - - xx_storeu2_epi32(output_ptr, out_pitch, &resReg); - - output_ptr += dst_stride; - - // save part of the registers for next strides - srcReg23_34_lo = srcReg45_56_lo; - srcReg4x = srcReg6x; - } -} - -#if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction aom_filter_block1d4_v8_ssse3; -filter8_1dfunction aom_filter_block1d16_v2_ssse3; -filter8_1dfunction aom_filter_block1d16_h2_ssse3; -filter8_1dfunction aom_filter_block1d8_v2_ssse3; -filter8_1dfunction aom_filter_block1d8_h2_ssse3; -filter8_1dfunction aom_filter_block1d4_v2_ssse3; -filter8_1dfunction aom_filter_block1d4_h2_ssse3; -#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3 -#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3 -#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3 -#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3 -#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3 -#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3 -#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3 -// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, 
ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -#endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c deleted file mode 100644 index 325a21b76..000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <tmmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve.h" -#include "aom_mem/aom_mem.h" -#include "aom_ports/mem.h" -#include "aom_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -// These are reused by the avx2 intrinsics. -filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; - -void aom_filter_block1d4_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the third 16 bit in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the seconds 16 bits in the filter into the second lane - // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the forth 16 bits in the filter into the second lane - // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - // loading the local filters - shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); - srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); - - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); - - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, 
srcRegFilt1); - src_ptr += src_pixels_per_line; - - // save only 4 bytes - *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d8_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer 
- srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr += src_pixels_per_line; - - // save only 8 bytes - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d8_v8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i 
*)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - - for (i = 0; i < output_height; i++) { - // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg 
= _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr += src_pitch; - - // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; - - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); - - output_ptr += out_pitch; - } -} - -filter8_1dfunction aom_filter_block1d16_v8_ssse3; -filter8_1dfunction aom_filter_block1d16_h8_ssse3; -filter8_1dfunction aom_filter_block1d8_v8_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_ssse3; -filter8_1dfunction aom_filter_block1d4_v8_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_ssse3; - -#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3 -#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3 -#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3 -#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3 -#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3 -#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3 - -filter8_1dfunction aom_filter_block1d16_v2_ssse3; -filter8_1dfunction aom_filter_block1d16_h2_ssse3; -filter8_1dfunction aom_filter_block1d8_v2_ssse3; -filter8_1dfunction aom_filter_block1d8_h2_ssse3; -filter8_1dfunction aom_filter_block1d4_v2_ssse3; -filter8_1dfunction aom_filter_block1d4_h2_ssse3; - -// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t 
*filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm deleted file mode 100644 index c88fc9ffb..000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm +++ /dev/null @@ -1,615 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. 
- -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd 
xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -SECTION .text - -;void aom_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_filter_block1d4_v8_sse2) PRIVATE -sym(aom_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 
5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_filter_block1d8_v8_sse2) PRIVATE -sym(aom_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; 
unsigned int output_height, -; short *filter -;) -global sym(aom_filter_block1d16_v8_sse2) PRIVATE -sym(aom_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_filter_block1d4_h8_sse2) PRIVATE -sym(aom_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, 
xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(aom_filter_block1d8_h8_sse2) PRIVATE -sym(aom_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global 
sym(aom_filter_block1d16_h8_sse2) PRIVATE -sym(aom_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm deleted file mode 100644 index 3ca7921b6..000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ /dev/null @@ -1,870 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_64: times 8 dw 64 -even_byte_mask: times 8 dw 0x00ff - -; %define USE_PMULHRSW -; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss -; when using this instruction. -; -; The add order below (based on ffav1) must be followed to prevent outranges. -; x = k0k1 + k4k5 -; y = k2k3 + k6k7 -; z = signed SAT(x + y) - -SECTION .text -%define LOCAL_VARS_SIZE 16*6 - -%macro SETUP_LOCAL_VARS 0 - ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + - ; pmaddubsw has a higher latency on some platforms, this might be eased by - ; interleaving the instructions. - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - packsswb m4, m4 - ; TODO(slavarnway): multiple pshufb instructions had a higher latency on - ; some platforms. 
- pshuflw m0, m4, 0b ;k0_k1 - pshuflw m1, m4, 01010101b ;k2_k3 - pshuflw m2, m4, 10101010b ;k4_k5 - pshuflw m3, m4, 11111111b ;k6_k7 - punpcklqdq m0, m0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - mova k0k1, m0 - mova k2k3, m1 - mova k4k5, m2 - mova k6k7, m3 -%if ARCH_X86_64 - %define krd m12 - %define tmp0 [rsp + 16*4] - %define tmp1 [rsp + 16*5] - mova krd, [GLOBAL(pw_64)] -%else - %define krd [rsp + 16*4] -%if CONFIG_PIC=0 - mova m6, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m6, m6 ;all ones - psrlw m6, 15 - psllw m6, 6 ;aka pw_64 -%endif - mova krd, m6 -%endif -%endm - -;------------------------------------------------------------------------------- -%if ARCH_X86_64 - %define LOCAL_VARS_SIZE_H4 0 -%else - %define LOCAL_VARS_SIZE_H4 16*4 -%endif - -%macro SUBPIX_HFILTER4 1 -cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - packsswb m4, m4 -%if ARCH_X86_64 - %define k0k1k4k5 m8 - %define k2k3k6k7 m9 - %define krd m10 - mova krd, [GLOBAL(pw_64)] - pshuflw k0k1k4k5, m4, 0b ;k0_k1 - pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 - pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 - pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 -%else - %define k0k1k4k5 [rsp + 16*0] - %define k2k3k6k7 [rsp + 16*1] - %define krd [rsp + 16*2] - pshuflw m6, m4, 0b ;k0_k1 - pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 - pshuflw m7, m4, 01010101b ;k2_k3 - pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 -%if CONFIG_PIC=0 - mova m1, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m1, m1 ;all ones - psrlw m1, 15 - psllw m1, 6 ;aka pw_64 -%endif - mova k0k1k4k5, m6 - mova k2k3k6k7, m7 - mova krd, m1 -%endif - dec heightd - -.loop: - ;Do two rows at once - movu m4, [srcq - 3] - movu m5, [srcq + sstrideq - 3] - punpckhbw m1, m4, m4 - punpcklbw m4, m4 - punpckhbw m3, m5, m5 - punpcklbw m5, m5 - palignr m0, m1, m4, 1 - pmaddubsw m0, 
k0k1k4k5 - palignr m1, m4, 5 - pmaddubsw m1, k2k3k6k7 - palignr m2, m3, m5, 1 - pmaddubsw m2, k0k1k4k5 - palignr m3, m5, 5 - pmaddubsw m3, k2k3k6k7 - punpckhqdq m4, m0, m2 - punpcklqdq m0, m2 - punpckhqdq m5, m1, m3 - punpcklqdq m1, m3 - paddsw m0, m4 - paddsw m1, m5 -%ifidn %1, h8_avg - movd m4, [dstq] - movd m5, [dstq + dstrideq] -%endif - paddsw m0, m1 - paddsw m0, krd - psraw m0, 7 -%ifidn %1, h8_add_src - pxor m3, m3 - movu m4, [srcq] - movu m5, [srcq + sstrideq] - punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2 - punpcklbw m4, m3 - paddsw m0, m4 -%endif - packuswb m0, m0 - psrldq m1, m0, 4 - -%ifidn %1, h8_avg - pavgb m0, m4 - pavgb m1, m5 -%endif - movd [dstq], m0 - movd [dstq + dstrideq], m1 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movu m4, [srcq - 3] - punpckhbw m1, m4, m4 - punpcklbw m4, m4 - palignr m0, m1, m4, 1 - palignr m1, m4, 5 - pmaddubsw m0, k0k1k4k5 - pmaddubsw m1, k2k3k6k7 - psrldq m2, m0, 8 - psrldq m3, m1, 8 - paddsw m0, m2 - paddsw m1, m3 - paddsw m0, m1 - paddsw m0, krd - psraw m0, 7 -%ifidn %1, h8_add_src - pxor m3, m3 - movu m4, [srcq] - punpcklbw m4, m3 - paddsw m0, m4 -%endif - packuswb m0, m0 -%ifidn %1, h8_avg - movd m4, [dstq] - pavgb m0, m4 -%endif - movd [dstq], m0 -.done: - REP_RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER8 1 -cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - dec heightd - -.loop: - ;Do two rows at once - movu m0, [srcq - 3] - movu m4, [srcq + sstrideq - 3] - punpckhbw m1, m0, m0 - punpcklbw m0, m0 - palignr m5, m1, m0, 13 - pmaddubsw m5, k6k7 - palignr m2, m1, m0, 5 - palignr m3, m1, m0, 9 - palignr m1, m0, 1 - 
pmaddubsw m1, k0k1 - punpckhbw m6, m4, m4 - punpcklbw m4, m4 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - - palignr m7, m6, m4, 13 - palignr m0, m6, m4, 5 - pmaddubsw m7, k6k7 - paddsw m1, m3 - paddsw m2, m5 - paddsw m1, m2 -%ifidn %1, h8_avg - movh m2, [dstq] - movhps m2, [dstq + dstrideq] -%endif - palignr m5, m6, m4, 9 - palignr m6, m4, 1 - pmaddubsw m0, k2k3 - pmaddubsw m6, k0k1 - paddsw m1, krd - pmaddubsw m5, k4k5 - psraw m1, 7 - paddsw m0, m7 - paddsw m6, m5 - paddsw m6, m0 - paddsw m6, krd - psraw m6, 7 -%ifidn %1, h8_add_src - pxor m3, m3 - movu m4, [srcq] - movu m5, [srcq + sstrideq] - punpcklbw m4, m3 - punpcklbw m5, m3 - paddsw m1, m4 - paddsw m6, m5 -%endif - packuswb m1, m6 -%ifidn %1, h8_avg - pavgb m1, m2 -%endif - movh [dstq], m1 - movhps [dstq + dstrideq], m1 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movu m0, [srcq - 3] - punpckhbw m3, m0, m0 - punpcklbw m0, m0 - palignr m1, m3, m0, 1 - palignr m2, m3, m0, 5 - palignr m4, m3, m0, 13 - palignr m3, m0, 9 - pmaddubsw m1, k0k1 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - pmaddubsw m4, k6k7 - paddsw m1, m3 - paddsw m4, m2 - paddsw m1, m4 - paddsw m1, krd - psraw m1, 7 -%ifidn %1, h8_add_src - pxor m6, m6 - movu m5, [srcq] - punpcklbw m5, m6 - paddsw m1, m5 -%endif - packuswb m1, m1 -%ifidn %1, h8_avg - movh m0, [dstq] - pavgb m1, m0 -%endif - movh [dstq], m1 -.done: - REP_RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER16 1 -cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - -.loop: - prefetcht0 [srcq + 2 * sstrideq -3] - - movu m0, [srcq - 3] - movu m4, [srcq - 2] - pmaddubsw m0, k0k1 - pmaddubsw m4, k0k1 - movu m1, [srcq - 1] 
- movu m5, [srcq + 0] - pmaddubsw m1, k2k3 - pmaddubsw m5, k2k3 - movu m2, [srcq + 1] - movu m6, [srcq + 2] - pmaddubsw m2, k4k5 - pmaddubsw m6, k4k5 - movu m3, [srcq + 3] - movu m7, [srcq + 4] - pmaddubsw m3, k6k7 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m1, m3 - paddsw m0, m1 - paddsw m4, m6 - paddsw m5, m7 - paddsw m4, m5 - paddsw m0, krd - paddsw m4, krd - psraw m0, 7 - psraw m4, 7 -%ifidn %1, h8_add_src -%if ARCH_X86=1 && CONFIG_PIC=1 - pcmpeqb m2, m2 ;all ones - psrlw m2, 8 ;even_byte_mask -%else - mova m2, [GLOBAL(even_byte_mask)] -%endif - movu m5, [srcq] - mova m7, m5 - pand m5, m2 - psrlw m7, 8 - paddsw m0, m5 - paddsw m4, m7 -%endif - packuswb m0, m0 - packuswb m4, m4 - punpcklbw m0, m4 -%ifidn %1, h8_avg - pavgb m0, [dstq] -%endif - lea srcq, [srcq + sstrideq] - mova [dstq], m0 - lea dstq, [dstq + dstrideq] - dec heightd - jnz .loop - REP_RET -%endm - -INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER4 h8 - -;------------------------------------------------------------------------------- - -; TODO(Linfeng): Detect cpu type and choose the code with better performance. 
-%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 - -%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - %define NUM_GENERAL_REG_USED 9 -%else - %define NUM_GENERAL_REG_USED 6 -%endif - -%macro SUBPIX_VFILTER 2 -cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - -%ifidn %2, 8 - %define movx movh -%else - %define movx movd -%endif - - dec heightd - -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - -.loop: - ;Do two rows at once - movx m0, [srcq ] ;A - movx m1, [src1q ] ;B - punpcklbw m0, m1 ;A B - movx m2, [srcq + sstrideq * 2 ] ;C - pmaddubsw m0, k0k1 - mova m6, m2 - movx m3, [src1q + sstrideq * 2] ;D - punpcklbw m2, m3 ;C D - pmaddubsw m2, k2k3 - movx m4, [srcq + sstrideq * 4 ] ;E - mova m7, m4 - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m4, k4k5 - punpcklbw m1, m6 ;A B next iter - movx m6, [srcq + sstride6q ] ;G - punpcklbw m5, m6 ;E F next iter - punpcklbw m3, m7 ;C D next iter - pmaddubsw m5, k4k5 - movx m7, [src1q + sstride6q ] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m6, k6k7 - pmaddubsw m3, k2k3 - pmaddubsw m1, k0k1 - paddsw m0, m4 - paddsw m2, m6 - movx m6, [srcq + sstrideq * 8 ] ;H next iter - punpcklbw m7, m6 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - paddsw m1, m5 -%ifidn %1, v8_add_src - pxor m6, m6 - movu m4, [srcq] - punpcklbw m4, m6 - paddsw m0, m4 -%endif - packuswb m0, m0 - - paddsw m3, m7 - paddsw m1, m3 - paddsw m1, krd - psraw m1, 7 -%ifidn %1, v8_add_src - movu m4, [src1q] - punpcklbw m4, m6 - paddsw m1, m4 -%endif - lea srcq, [srcq + sstrideq * 2 ] - lea 
src1q, [src1q + sstrideq * 2] - packuswb m1, m1 - -%ifidn %1, v8_avg - movx m2, [dstq] - pavgb m0, m2 -%endif - movx [dstq], m0 - add dstq, dst_stride -%ifidn %1, v8_avg - movx m3, [dstq] - pavgb m1, m3 -%endif - movx [dstq], m1 - add dstq, dst_stride - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - movx m6, [srcq + sstride6q ] ;G - punpcklbw m0, m1 ;A B - movx m7, [src1q + sstride6q ] ;H - pmaddubsw m0, k0k1 - movx m2, [srcq + sstrideq * 2 ] ;C - punpcklbw m6, m7 ;G H - movx m3, [src1q + sstrideq * 2] ;D - pmaddubsw m6, k6k7 - movx m4, [srcq + sstrideq * 4 ] ;E - punpcklbw m2, m3 ;C D - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - paddsw m2, m6 - paddsw m0, m4 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 -%ifidn %1, v8_add_src - pxor m6, m6 - movu m4, [srcq] - punpcklbw m4, m6 - paddsw m0, m4 -%endif - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 - -%else - ; ARCH_X86_64 - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - lea srcq, [srcq + sstrideq * 2 ] - movx m2, [srcq] ;C - movx m3, [srcq + sstrideq] ;D - lea srcq, [srcq + sstrideq * 2 ] - movx m4, [srcq] ;E - movx m5, [srcq + sstrideq] ;F - lea srcq, [srcq + sstrideq * 2 ] - movx m6, [srcq] ;G - punpcklbw m0, m1 ;A B - punpcklbw m1, m2 ;A B next iter - punpcklbw m2, m3 ;C D - punpcklbw m3, m4 ;C D next iter - punpcklbw m4, m5 ;E F - punpcklbw m5, m6 ;E F next iter - -.loop: - ;Do two rows at once - movx m7, [srcq + sstrideq] ;H - lea srcq, [srcq + sstrideq * 2 ] - movx m14, [srcq] ;H next iter - punpcklbw m6, m7 ;G H - punpcklbw m7, m14 ;G H next iter - pmaddubsw m8, m0, k0k1 - pmaddubsw m9, m1, k0k1 - mova m0, m2 - mova m1, m3 - pmaddubsw m10, m2, k2k3 - pmaddubsw m11, m3, k2k3 - mova m2, m4 - mova m3, m5 - pmaddubsw m4, k4k5 - pmaddubsw m5, k4k5 - paddsw m8, m4 - paddsw m9, m5 - mova m4, m6 - mova m5, m7 
- pmaddubsw m6, k6k7 - pmaddubsw m7, k6k7 - paddsw m10, m6 - paddsw m11, m7 - paddsw m8, m10 - paddsw m9, m11 - mova m6, m14 - paddsw m8, krd - paddsw m9, krd - psraw m8, 7 - psraw m9, 7 -%ifidn %2, 4 - packuswb m8, m8 - packuswb m9, m9 -%else - packuswb m8, m9 -%endif - -%ifidn %1, v8_avg - movx m7, [dstq] -%ifidn %2, 4 - movx m10, [dstq + dstrideq] - pavgb m9, m10 -%else - movhpd m7, [dstq + dstrideq] -%endif - pavgb m8, m7 -%endif - movx [dstq], m8 -%ifidn %2, 4 - movx [dstq + dstrideq], m9 -%else - movhpd [dstq + dstrideq], m8 -%endif - - lea dstq, [dstq + dstrideq * 2 ] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movx m7, [srcq + sstrideq] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m0, k0k1 - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - pmaddubsw m6, k6k7 - paddsw m0, m4 - paddsw m2, m6 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 - -%endif ; ARCH_X86_64 - -.done: - REP_RET - -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER16 1 -cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS - -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON - -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - lea src1q, [srcq + sstrideq] - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - -.loop: - movh m0, [srcq ] ;A - movh m1, [src1q ] ;B - movh m2, [srcq + sstrideq * 2 ] ;C - movh m3, [src1q + sstrideq * 2] ;D - movh m4, [srcq + sstrideq * 4 ] ;E - movh m5, [src1q + sstrideq * 4] ;F - - punpcklbw m0, m1 ;A B - movh m6, [srcq + sstride6q] ;G - punpcklbw m2, m3 ;C D - movh m7, [src1q + sstride6q] ;H - punpcklbw 
m4, m5 ;E F - pmaddubsw m0, k0k1 - movh m3, [srcq + 8] ;A - pmaddubsw m2, k2k3 - punpcklbw m6, m7 ;G H - movh m5, [srcq + sstrideq + 8] ;B - pmaddubsw m4, k4k5 - punpcklbw m3, m5 ;A B - movh m7, [srcq + sstrideq * 2 + 8] ;C - pmaddubsw m6, k6k7 - movh m5, [src1q + sstrideq * 2 + 8] ;D - punpcklbw m7, m5 ;C D - paddsw m2, m6 - pmaddubsw m3, k0k1 - movh m1, [srcq + sstrideq * 4 + 8] ;E - paddsw m0, m4 - pmaddubsw m7, k2k3 - movh m6, [src1q + sstrideq * 4 + 8] ;F - punpcklbw m1, m6 ;E F - paddsw m0, m2 - paddsw m0, krd - movh m2, [srcq + sstride6q + 8] ;G - pmaddubsw m1, k4k5 - movh m5, [src1q + sstride6q + 8] ;H - psraw m0, 7 - punpcklbw m2, m5 ;G H - pmaddubsw m2, k6k7 - paddsw m7, m2 - paddsw m3, m1 - paddsw m3, m7 - paddsw m3, krd - psraw m3, 7 -%ifidn %1, v8_add_src - pxor m6, m6 - movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down - mova m5, m4 - punpcklbw m4, m6 - punpckhbw m5, m6 - paddsw m0, m4 - paddsw m3, m5 -%endif - packuswb m0, m3 - - add srcq, sstrideq - add src1q, sstrideq -%ifidn %1, v8_avg - pavgb m0, [dstq] -%endif - mova [dstq], m0 - add dstq, dst_stride - dec heightd - jnz .loop - REP_RET - -%else - ; ARCH_X86_64 - dec heightd - - movu m1, [srcq ] ;A - movu m3, [srcq + sstrideq ] ;B - lea srcq, [srcq + sstrideq * 2] - punpcklbw m0, m1, m3 ;A B - punpckhbw m1, m3 ;A B - movu m5, [srcq] ;C - punpcklbw m2, m3, m5 ;A B next iter - punpckhbw m3, m5 ;A B next iter - mova tmp0, m2 ;store to stack - mova tmp1, m3 ;store to stack - movu m7, [srcq + sstrideq] ;D - lea srcq, [srcq + sstrideq * 2] - punpcklbw m4, m5, m7 ;C D - punpckhbw m5, m7 ;C D - movu m9, [srcq] ;E - punpcklbw m6, m7, m9 ;C D next iter - punpckhbw m7, m9 ;C D next iter - movu m11, [srcq + sstrideq] ;F - lea srcq, [srcq + sstrideq * 2] - punpcklbw m8, m9, m11 ;E F - punpckhbw m9, m11 ;E F - movu m2, [srcq] ;G - punpcklbw m10, m11, m2 ;E F next iter - punpckhbw m11, m2 ;E F next iter - -.loop: - ;Do two rows at once - pmaddubsw m13, m0, k0k1 - mova m0, m4 - pmaddubsw m14, m8, k4k5 - 
pmaddubsw m15, m4, k2k3 - mova m4, m8 - paddsw m13, m14 - movu m3, [srcq + sstrideq] ;H - lea srcq, [srcq + sstrideq * 2] - punpcklbw m14, m2, m3 ;G H - mova m8, m14 - pmaddubsw m14, k6k7 - paddsw m15, m14 - paddsw m13, m15 - paddsw m13, krd - psraw m13, 7 - - pmaddubsw m14, m1, k0k1 - pmaddubsw m1, m9, k4k5 - pmaddubsw m15, m5, k2k3 - paddsw m14, m1 - mova m1, m5 - mova m5, m9 - punpckhbw m2, m3 ;G H - mova m9, m2 - pmaddubsw m2, k6k7 - paddsw m15, m2 - paddsw m14, m15 - paddsw m14, krd - psraw m14, 7 - packuswb m13, m14 -%ifidn %1, v8_avg - pavgb m13, [dstq] -%endif - mova [dstq], m13 - - ; next iter - pmaddubsw m15, tmp0, k0k1 - pmaddubsw m14, m10, k4k5 - pmaddubsw m13, m6, k2k3 - paddsw m15, m14 - mova tmp0, m6 - mova m6, m10 - movu m2, [srcq] ;G next iter - punpcklbw m14, m3, m2 ;G H next iter - mova m10, m14 - pmaddubsw m14, k6k7 - paddsw m13, m14 - paddsw m15, m13 - paddsw m15, krd - psraw m15, 7 - - pmaddubsw m14, tmp1, k0k1 - mova tmp1, m7 - pmaddubsw m13, m7, k2k3 - mova m7, m11 - pmaddubsw m11, k4k5 - paddsw m14, m11 - punpckhbw m3, m2 ;G H next iter - mova m11, m3 - pmaddubsw m3, k6k7 - paddsw m13, m3 - paddsw m14, m13 - paddsw m14, krd - psraw m14, 7 - packuswb m15, m14 -%ifidn %1, v8_avg - pavgb m15, [dstq + dstrideq] -%endif - mova [dstq + dstrideq], m15 - lea dstq, [dstq + dstrideq * 2] - sub heightd, 2 - jg .loop - - ; Do last row if output_height is odd - jne .done - - movu m3, [srcq + sstrideq] ;H - punpcklbw m6, m2, m3 ;G H - punpckhbw m2, m3 ;G H - pmaddubsw m0, k0k1 - pmaddubsw m1, k0k1 - pmaddubsw m4, k2k3 - pmaddubsw m5, k2k3 - pmaddubsw m8, k4k5 - pmaddubsw m9, k4k5 - pmaddubsw m6, k6k7 - pmaddubsw m2, k6k7 - paddsw m0, m8 - paddsw m1, m9 - paddsw m4, m6 - paddsw m5, m2 - paddsw m0, m4 - paddsw m1, m5 - paddsw m0, krd - paddsw m1, krd - psraw m0, 7 - psraw m1, 7 - packuswb m0, m1 -%ifidn %1, v8_avg - pavgb m0, [dstq] -%endif - mova [dstq], m0 - -.done: - REP_RET - -%endif ; ARCH_X86_64 - -%endm - -INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 
-SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8, 4 diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm deleted file mode 100644 index d0b4b2839..000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - pshuflw xmm4, xmm3, 11111111b ;k3 - psrldq xmm3, 8 - pshuflw xmm3, xmm3, 0b ;k4 - punpcklqdq xmm4, xmm3 ;k3k4 - - movq xmm3, rcx ;rounding - pshufd xmm3, xmm3, 0 - - pxor xmm2, xmm2 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - - punpckldq xmm0, xmm1 ;two row in one register - punpcklbw xmm0, xmm2 ;unpack to word - pmullw xmm0, xmm4 ;multiply the filter factors - - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - - paddsw xmm0, xmm3 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa 
xmm7, [rdx] ;load filters - - pshuflw xmm6, xmm7, 11111111b ;k3 - pshufhw xmm7, xmm7, 0b ;k4 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movq xmm4, rcx ;rounding - pshufd xmm4, xmm4, 0 - - pxor xmm5, xmm5 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - pmullw xmm2, xmm6 - pmullw xmm3, xmm7 - - paddsw xmm0, xmm1 - paddsw xmm2, xmm3 - - paddsw xmm0, xmm4 ;rounding - paddsw xmm2, xmm4 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -SECTION .text - -global sym(aom_filter_block1d4_v2_sse2) PRIVATE -sym(aom_filter_block1d4_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v2_sse2) PRIVATE -sym(aom_filter_block1d8_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - 
UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v2_sse2) PRIVATE -sym(aom_filter_block1d16_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d4_h2_sse2) PRIVATE -sym(aom_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_h2_sse2) PRIVATE -sym(aom_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h2_sse2) PRIVATE -sym(aom_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm deleted file mode 100644 index 59edc49a9..000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm +++ /dev/null @@ -1,267 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov ecx, 0x01000100 - - movdqa xmm3, [rdx] ;load filters - psrldq xmm3, 6 - packsswb xmm3, xmm3 - pshuflw xmm3, xmm3, 0b ;k3_k4 - - movd xmm2, ecx ;rounding_shift - pshufd xmm2, xmm2, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm3 - - pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov ecx, 0x01000100 - - movdqa xmm7, [rdx] ;load filters - psrldq xmm7, 6 - packsswb xmm7, xmm7 - pshuflw xmm7, xmm7, 0b ;k3_k4 - punpcklwd xmm7, xmm7 - - movd xmm6, ecx ;rounding_shift - pshufd xmm6, xmm6, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm7 - - pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) - packuswb xmm0, xmm0 ;pack back to byte - -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea 
rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, xmm7 - pmaddubsw xmm2, xmm7 - - pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) - pmulhrsw xmm2, xmm6 - packuswb xmm0, xmm2 ;pack back to byte - -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -SECTION .text - -global sym(aom_filter_block1d4_v2_ssse3) PRIVATE -sym(aom_filter_block1d4_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v2_ssse3) PRIVATE -sym(aom_filter_block1d8_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v2_ssse3) PRIVATE -sym(aom_filter_block1d16_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d4_h2_ssse3) PRIVATE -sym(aom_filter_block1d4_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global 
sym(aom_filter_block1d8_h2_ssse3) PRIVATE -sym(aom_filter_block1d8_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h2_ssse3) PRIVATE -sym(aom_filter_block1d16_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c deleted file mode 100644 index 4f5e3f8c1..000000000 --- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom/aom_integer.h" - -#include "config/aom_dsp_rtcd.h" - -// To start out, just dispatch to the function using the 2D mask and -// pass mask stride as 0. This can be improved upon if necessary. 
- -void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, 0, w, h, 0, 0); -} - -void aom_highbd_blend_a64_hmask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, - uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int w, int h, int bd) { - aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, - src1_8, src1_stride, mask, 0, w, h, 0, 0, - bd); -} diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c deleted file mode 100644 index 67fb4d32b..000000000 --- a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c +++ /dev/null @@ -1,900 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <smmintrin.h> // SSE4.1 -#include <immintrin.h> // AVX2 - -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/aom_dsp_common.h" - -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/synonyms_avx2.h" -#include "aom_dsp/x86/blend_sse4.h" -#include "aom_dsp/x86/blend_mask_sse4.h" - -#include "config/aom_dsp_rtcd.h" - -static INLINE void blend_a64_d16_mask_w16_avx2( - uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, - const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval, - int shift) { - const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); - const __m256i s0_0 = yy_loadu_256(src0); - const __m256i s1_0 = yy_loadu_256(src1); - __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), - _mm256_unpacklo_epi16(*m0, max_minus_m0)); - __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), - _mm256_unpackhi_epi16(*m0, max_minus_m0)); - res0_lo = - _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); - res0_hi = - _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); - const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); - __m256i res = _mm256_packus_epi16(res0, res0); - res = _mm256_permute4x64_epi64(res, 0xd8); - _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res)); -} - -static INLINE void blend_a64_d16_mask_w32_avx2( - uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, - const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset, - const __m256i *v_maxval, int shift) { - const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); - const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1); - const __m256i s0_0 = yy_loadu_256(src0); - const __m256i s0_1 = yy_loadu_256(src0 + 16); - const __m256i s1_0 = yy_loadu_256(src1); - const __m256i s1_1 = yy_loadu_256(src1 + 16); - __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), - 
_mm256_unpacklo_epi16(*m0, max_minus_m0)); - __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), - _mm256_unpackhi_epi16(*m0, max_minus_m0)); - __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1), - _mm256_unpacklo_epi16(*m1, max_minus_m1)); - __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1), - _mm256_unpackhi_epi16(*m1, max_minus_m1)); - res0_lo = - _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); - res0_hi = - _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); - res1_lo = - _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift); - res1_hi = - _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift); - const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); - const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi); - __m256i res = _mm256_packus_epi16(res0, res1); - res = _mm256_permute4x64_epi64(res, 0xd8); - _mm256_storeu_si256((__m256i *)(dst), res); -} - -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m256i *round_offset, int shift) { - const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - for (int i = 0; i < h; ++i) { - const __m128i m = xx_loadu_128(mask); - const __m256i m0 = _mm256_cvtepu8_epi16(m); - - blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, - shift); - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m256i *round_offset, int shift) { - const __m256i 
v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 32) { - const __m256i m = yy_loadu_256(mask + j); - const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); - const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); - - blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m256i *round_offset, int shift) { - const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m256i one_b = _mm256_set1_epi8(1); - const __m256i two_w = _mm256_set1_epi16(2); - for (int i = 0; i < h; ++i) { - const __m256i m_i00 = yy_loadu_256(mask); - const __m256i m_i10 = yy_loadu_256(mask + mask_stride); - - const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); - const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); - const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); - - blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, - shift); - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m256i *round_offset, int shift) { - const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m256i one_b = _mm256_set1_epi8(1); - const __m256i two_w = _mm256_set1_epi16(2); - for (int i = 0; i < 
h; ++i) { - for (int j = 0; j < w; j += 32) { - const __m256i m_i00 = yy_loadu_256(mask + 2 * j); - const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); - const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); - const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); - - const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); - const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); - const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); - const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); - const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); - const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); - - blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m256i *round_offset, int shift) { - const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m256i one_b = _mm256_set1_epi8(1); - const __m256i zeros = _mm256_setzero_si256(); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const __m256i m_i00 = yy_loadu_256(mask + 2 * j); - const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); - const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); - - blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, - round_offset, &v_maxval, shift); - } - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t 
src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m256i *round_offset, int shift) { - const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m256i one_b = _mm256_set1_epi8(1); - const __m256i zeros = _mm256_setzero_si256(); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 32) { - const __m256i m_i00 = yy_loadu_256(mask + 2 * j); - const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); - const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); - const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); - const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); - const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); - - blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m256i *round_offset, int shift) { - const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i zeros = _mm_setzero_si128(); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const __m128i m_i00 = xx_loadu_128(mask + j); - const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); - - const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); - const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); - - blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, - round_offset, &v_maxval, shift); - } - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const 
CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m256i *round_offset, int shift) { - const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m256i zeros = _mm256_setzero_si256(); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 32) { - const __m256i m_i00 = yy_loadu_256(mask + j); - const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); - - const __m256i m_ac = - _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); - const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); - const __m256i m1 = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); - - blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -void aom_lowbd_blend_a64_d16_mask_avx2( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, - ConvolveParams *conv_params) { - const int bd = 8; - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - - const int round_offset = - ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - - (1 << (round_bits - 1))) - << AOM_BLEND_A64_ROUND_BITS; - - const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; - assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); - - assert(h >= 4); - assert(w >= 4); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - const __m128i v_round_offset = _mm_set1_epi32(round_offset); - const __m256i y_round_offset = _mm256_set1_epi32(round_offset); - - if (subw == 0 && subh == 0) { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( - dst, dst_stride, 
src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 16: - lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &y_round_offset, shift); - break; - default: - lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &y_round_offset, shift); - break; - } - } else if (subw == 1 && subh == 1) { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 16: - lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &y_round_offset, shift); - break; - default: - lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &y_round_offset, shift); - break; - } - } else if (subw == 1 && subh == 0) { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 16: - lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &y_round_offset, shift); - break; - 
default: - lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &y_round_offset, shift); - break; - } - } else { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 16: - lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &y_round_offset, shift); - break; - default: - lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &y_round_offset, shift); - break; - } - } -} - -static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, - const __m256i *v_m0_b, - const __m256i *v_m1_b, - const int32_t bits) { - const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); - const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); - const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); - const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); - - const __m256i v_p0_w = - _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), - _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); - - const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); - const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); - const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); - return v_res; -} - -static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, - const __m256i *v_m0_b, - const __m256i *v_m1_b, - const int32_t bits) { - const __m256i v_s0_b = yy_loadu_256(src0); - const __m256i v_s1_b = yy_loadu_256(src1); - - const __m256i v_p0_w = - 
_mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), - _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); - const __m256i v_p1_w = - _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), - _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); - - const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); - const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); - const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); - return v_res; -} - -static INLINE void blend_a64_mask_sx_sy_w16_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h) { - const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); - const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - do { - const __m256i v_ral_b = yy_loadu_256(mask); - const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); - const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); - const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); - const __m256i v_rvsbl_w = - _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); - const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); - - const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); - const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); - const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); - - const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, - AOM_BLEND_A64_ROUND_BITS); - - xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static INLINE void blend_a64_mask_sx_sy_w32n_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m256i v_zmask_b = 
_mm256_set1_epi16(0xFF); - do { - int c; - for (c = 0; c < w; c += 32) { - const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); - const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); - const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); - const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); - const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); - const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); - const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); - const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); - const __m256i v_rvsbl_w = - _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); - const __m256i v_rvsbh_w = - _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); - const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); - const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); - - const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); - const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); - const __m256i v_m0_b = - _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); - const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); - - const __m256i v_res_b = blend_32_u8_avx2( - src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); - - yy_storeu_256(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static INLINE void blend_a64_mask_sx_sy_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - switch (w) { - case 4: - do { - const __m128i v_ra_b = xx_loadl_64(mask); - const __m128i v_rb_b = 
xx_loadl_64(mask + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); - const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); - const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); - const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); - const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); - break; - case 8: - do { - const __m128i v_ra_b = xx_loadu_128(mask); - const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); - const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); - const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); - const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); - const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); - break; - case 16: - blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h); - break; - default: - blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h); - break; - } -} - -static INLINE void blend_a64_mask_sx_w16_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t 
*src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h) { - const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m256i v_zmask_b = _mm256_set1_epi16(0xff); - do { - const __m256i v_rl_b = yy_loadu_256(mask); - const __m256i v_al_b = - _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); - - const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); - const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); - const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); - - const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, - AOM_BLEND_A64_ROUND_BITS); - - xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static INLINE void blend_a64_mask_sx_w32n_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); - const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - do { - int c; - for (c = 0; c < w; c += 32) { - const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); - const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); - const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); - const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); - const __m256i v_al_b = - _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); - const __m256i v_ah_b = - _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); - - const __m256i v_m0_b = - _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); - const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); - - const __m256i v_res_b = blend_32_u8_avx2( - src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); - - yy_storeu_256(dst + c, v_res_b); - } - dst += 
dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static INLINE void blend_a64_mask_sx_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - switch (w) { - case 4: - do { - const __m128i v_r_b = xx_loadl_64(mask); - const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); - const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); - break; - case 8: - do { - const __m128i v_r_b = xx_loadu_128(mask); - const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); - const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); - break; - case 16: - blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h); - break; - default: - blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, 
mask_stride, w, h); - break; - } -} - -static INLINE void blend_a64_mask_sy_w16_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h) { - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - do { - const __m128i v_ra_b = xx_loadu_128(mask); - const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); - const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); - - const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storeu_128(dst, v_res_b); - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static INLINE void blend_a64_mask_sy_w32n_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - do { - int c; - for (c = 0; c < w; c += 32) { - const __m256i v_ra_b = yy_loadu_256(mask + c); - const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); - const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); - const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); - const __m256i v_res_b = blend_32_u8_avx2( - src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); - - yy_storeu_256(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static INLINE void blend_a64_mask_sy_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i _r = _mm_set1_epi16(1 << (15 - 
AOM_BLEND_A64_ROUND_BITS)); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - switch (w) { - case 4: - do { - const __m128i v_ra_b = xx_loadl_32(mask); - const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); - const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); - break; - case 8: - do { - const __m128i v_ra_b = xx_loadl_64(mask); - const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); - break; - case 16: - blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h); - break; - default: - blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h); - } -} - -static INLINE void blend_a64_mask_w32n_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - do { - int c; - for (c = 0; c < w; c += 32) { - const __m256i v_m0_b = yy_loadu_256(mask + c); - const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); - - const __m256i v_res_b = blend_32_u8_avx2( - src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); - - yy_storeu_256(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 
mask_stride; - } while (--h); -} - -static INLINE void blend_a64_mask_avx2( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - switch (w) { - case 4: - do { - const __m128i v_m0_b = xx_loadl_32(mask); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); - break; - case 8: - do { - const __m128i v_m0_b = xx_loadl_64(mask); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); - break; - case 16: - do { - const __m128i v_m0_b = xx_loadu_128(mask); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storeu_128(dst, v_res_b); - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); - break; - default: - blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h); - } -} - -void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - 
assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); - } else { - if (subx & suby) { - blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h); - } else if (subx) { - blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h); - } else if (suby) { - blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h); - } else { - blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h); - } - } -} diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c deleted file mode 100644 index 9d6b4c2f7..000000000 --- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c +++ /dev/null @@ -1,1109 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <smmintrin.h> // SSE4.1 - -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/blend.h" - -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/blend_sse4.h" -#include "aom_dsp/x86/blend_mask_sse4.h" - -#include "config/aom_dsp_rtcd.h" - -////////////////////////////////////////////////////////////////////////////// -// No sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int w, int h) { - (void)w; - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - const __m128i v_m0_b = xx_loadl_32(mask); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int w, int h) { - (void)w; - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - const __m128i v_m0_b = xx_loadl_64(mask); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_w16n_sse4_1( - uint8_t *dst, 
uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - - do { - int c; - for (c = 0; c < w; c += 16) { - const __m128i v_m0_b = xx_loadu_128(mask + c); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = - blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); - - xx_storeu_128(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -////////////////////////////////////////////////////////////////////////////// -// Horizontal sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static void blend_a64_mask_sx_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - - const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - const __m128i v_r_b = xx_loadl_64(mask); - const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); - const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_sx_w8_sse4_1( - uint8_t *dst, 
uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - - const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - const __m128i v_r_b = xx_loadu_128(mask); - const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); - const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); - const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_sx_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - - do { - int c; - for (c = 0; c < w; c += 16) { - const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); - const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); - const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); - const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); - const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); - const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); - const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i 
v_res_b = - blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); - - xx_storeu_128(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -////////////////////////////////////////////////////////////////////////////// -// Vertical sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static void blend_a64_mask_sy_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - - do { - const __m128i v_ra_b = xx_loadl_32(mask); - const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); - const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_sy_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - const __m128i v_ra_b = xx_loadl_64(mask); - const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += 
src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_sy_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - int c; - for (c = 0; c < w; c += 16) { - const __m128i v_ra_b = xx_loadu_128(mask + c); - const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); - const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = - blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); - - xx_storeu_128(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -////////////////////////////////////////////////////////////////////////////// -// Horizontal and Vertical sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static void blend_a64_mask_sx_sy_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - (void)w; - - do { - const __m128i v_ra_b = xx_loadl_64(mask); - const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); - const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); - const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); - 
const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); - const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_sx_sy_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - (void)w; - - do { - const __m128i v_ra_b = xx_loadu_128(mask); - const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); - - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); - const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); - const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); - const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); - const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_sx_sy_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = 
_mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - int c; - for (c = 0; c < w; c += 16) { - const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); - const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); - const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); - const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); - const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); - const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); - const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); - const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); - const __m128i v_rvsbl_w = - _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); - const __m128i v_rvsbh_w = - _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); - const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); - const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); - - const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); - const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); - const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); - const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - - const __m128i v_res_b = - blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); - - xx_storeu_128(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -////////////////////////////////////////////////////////////////////////////// -// Dispatch -////////////////////////////////////////////////////////////////////////////// - -void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { - typedef void (*blend_fn)( - 
uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, - uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h); - - // Dimensions are: width_index X subx X suby - static const blend_fn blend[3][2][2] = { - { // w % 16 == 0 - { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, - { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, - { // w == 4 - { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, - { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, - { // w == 8 - { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, - { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } - }; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); - } else { - blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, - src0_stride, src1, src1_stride, - mask, mask_stride, w, h); - } -} - -////////////////////////////////////////////////////////////////////////////// -// No sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static INLINE void blend_a64_mask_bn_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - const __m128i v_m0_b = xx_loadl_32(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); 
- - xx_storel_64(dst, v_res_w); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, blend_4_b10); -} - -static void blend_a64_mask_b12_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, blend_4_b12); -} - -static INLINE void blend_a64_mask_bn_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, - blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - int c; - for (c = 0; c < w; c += 8) { - const __m128i v_m0_b = xx_loadl_64(mask + c); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); - - xx_storeu_128(dst + c, v_res_w); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, 
- blend_8_b10); -} - -static void blend_a64_mask_b12_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, - blend_8_b12); -} - -////////////////////////////////////////////////////////////////////////////// -// Horizontal sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - const __m128i v_r_b = xx_loadl_64(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); - - xx_storel_64(dst, v_res_w); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); -} - -static void blend_a64_mask_b12_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t 
*src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); -} - -static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, - blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - int c; - for (c = 0; c < w; c += 8) { - const __m128i v_r_b = xx_loadu_128(mask + 2 * c); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); - - xx_storeu_128(dst + c, v_res_w); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, - blend_8_b10); -} - -static void blend_a64_mask_b12_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, - blend_8_b12); -} - 
-////////////////////////////////////////////////////////////////////////////// -// Vertical sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - const __m128i v_ra_b = xx_loadl_32(mask); - const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); - - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); - - xx_storel_64(dst, v_res_w); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); -} - -static void blend_a64_mask_b12_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); -} - -static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int 
h, - blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - int c; - for (c = 0; c < w; c += 8) { - const __m128i v_ra_b = xx_loadl_64(mask + c); - const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); - - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); - - xx_storeu_128(dst + c, v_res_w); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, - blend_8_b10); -} - -static void blend_a64_mask_b12_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, - blend_8_b12); -} - -////////////////////////////////////////////////////////////////////////////// -// Horizontal and Vertical sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i 
v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - const __m128i v_ra_b = xx_loadl_64(mask); - const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); - - const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); - - xx_storel_64(dst, v_res_w); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); -} - -static void blend_a64_mask_b12_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - (void)w; - blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); -} - -static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, - blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - int c; - 
for (c = 0; c < w; c += 8) { - const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); - const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); - - const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); - - xx_storeu_128(dst + c, v_res_w); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 2 * mask_stride; - } while (--h); -} - -static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, - blend_8_b10); -} - -static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h) { - blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, w, h, - blend_8_b12); -} - -////////////////////////////////////////////////////////////////////////////// -// Dispatch -////////////////////////////////////////////////////////////////////////////// - -void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, - uint32_t src0_stride, - const uint8_t *src1_8, - uint32_t src1_stride, const uint8_t *mask, - uint32_t mask_stride, int w, int h, - int subx, int suby, int bd) { - typedef void (*blend_fn)( - uint16_t * dst, 
uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h); - - // Dimensions are: bd_index X width_index X subx X suby - static const blend_fn blend[2][2][2][2] = { - { // bd == 8 or 10 - { // w % 8 == 0 - { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, - { blend_a64_mask_b10_sx_w8n_sse4_1, - blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, - { // w == 4 - { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, - { blend_a64_mask_b10_sx_w4_sse4_1, - blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, - { // bd == 12 - { // w % 8 == 0 - { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, - { blend_a64_mask_b12_sx_w8n_sse4_1, - blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, - { // w == 4 - { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, - { blend_a64_mask_b12_sx_w4_sse4_1, - blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } - }; - - assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); - assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - assert(bd == 8 || bd == 10 || bd == 12); - if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, mask_stride, w, h, subx, - suby, bd); - } else { - uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); - const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); - const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - - blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, w, h); - } -} - -static INLINE void blend_a64_d16_mask_w16_sse41( - uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, - const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, - const __m128i 
*v_maxval, int shift) { - const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); - const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); - const __m128i s0_0 = xx_loadu_128(src0); - const __m128i s0_1 = xx_loadu_128(src0 + 8); - const __m128i s1_0 = xx_loadu_128(src1); - const __m128i s1_1 = xx_loadu_128(src1 + 8); - __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), - _mm_unpacklo_epi16(*m0, max_minus_m0)); - __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), - _mm_unpackhi_epi16(*m0, max_minus_m0)); - __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), - _mm_unpacklo_epi16(*m1, max_minus_m1)); - __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), - _mm_unpackhi_epi16(*m1, max_minus_m1)); - res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); - res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); - res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); - res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); - const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); - const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); - const __m128i res = _mm_packus_epi16(res0, res1); - - _mm_storeu_si128((__m128i *)(dst), res); -} - -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const __m128i m = xx_loadu_128(mask + j); - const __m128i m0 = _mm_cvtepu8_epi16(m); - const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); - - blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask 
+= mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i two_w = _mm_set1_epi16(2); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const __m128i m_i00 = xx_loadu_128(mask + 2 * j); - const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); - const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); - const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); - - const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); - const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); - const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); - const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); - const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); - const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); - - blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i zeros = _mm_setzero_si128(); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const __m128i 
m_i00 = xx_loadu_128(mask + 2 * j); - const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); - const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); - const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); - const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); - const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); - - blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i zeros = _mm_setzero_si128(); - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const __m128i m_i00 = xx_loadu_128(mask + j); - const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); - - const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); - const __m128i m0 = _mm_cvtepu8_epi16(m_ac); - const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); - - blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, - round_offset, &v_maxval, shift); - } - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -void aom_lowbd_blend_a64_d16_mask_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, - ConvolveParams *conv_params) { - const int bd = 8; - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - - const int round_offset = - ((1 << (round_bits + bd)) + (1 << (round_bits + 
bd - 1)) - - (1 << (round_bits - 1))) - << AOM_BLEND_A64_ROUND_BITS; - - const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; - assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); - - assert(h >= 4); - assert(w >= 4); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - const __m128i v_round_offset = _mm_set1_epi32(round_offset); - - if (subw == 0 && subh == 0) { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - default: - lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &v_round_offset, shift); - break; - } - - } else if (subw == 1 && subh == 1) { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - default: - lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &v_round_offset, shift); - break; - } - } else if (subw == 1 && subh == 0) { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, 
&v_round_offset, shift); - break; - default: - lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &v_round_offset, shift); - break; - } - } else { - switch (w) { - case 4: - aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - case 8: - aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, &v_round_offset, shift); - break; - default: - lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( - dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w, &v_round_offset, shift); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c deleted file mode 100644 index 064910232..000000000 --- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <smmintrin.h> // SSE4.1 - -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/blend.h" - -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/blend_sse4.h" - -#include "config/aom_dsp_rtcd.h" - -////////////////////////////////////////////////////////////////////////////// -// Implementation - No sub-sampling -////////////////////////////////////////////////////////////////////////////// - -static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - (void)w; - - do { - const __m128i v_m0_w = _mm_set1_epi16(*mask); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - - xx_storel_32(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 1; - } while (--h); -} - -static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - (void)w; - - do { - const __m128i v_m0_w = _mm_set1_epi16(*mask); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - - xx_storel_64(dst, v_res_b); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 1; - } while (--h); -} - -static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, - uint32_t src0_stride, - const uint8_t *src1, - uint32_t 
src1_stride, - const uint8_t *mask, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - int c; - const __m128i v_m0_w = _mm_set1_epi16(*mask); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - for (c = 0; c < w; c += 16) { - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); - - xx_storeu_128(dst + c, v_res_b); - } - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 1; - } while (--h); -} - -////////////////////////////////////////////////////////////////////////////// -// Dispatch -////////////////////////////////////////////////////////////////////////////// - -void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h); - - // Dimension: width_index - static const blend_fn blend[9] = { - blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 - aom_blend_a64_vmask_c, // w == 1 - aom_blend_a64_vmask_c, // w == 2 - NULL, // INVALID - blend_a64_vmask_w4_sse4_1, // w == 4 - NULL, // INVALID - NULL, // INVALID - NULL, // INVALID - blend_a64_vmask_w8_sse4_1, // w == 8 - }; - - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, - h); -} - -////////////////////////////////////////////////////////////////////////////// -// Implementation - No sub-sampling 
-////////////////////////////////////////////////////////////////////////////// - -static INLINE void blend_a64_vmask_bn_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - const __m128i v_m0_w = _mm_set1_epi16(*mask); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); - - xx_storel_64(dst, v_res_w); - - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - mask += 1; - } while (--h); -} - -static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, - uint32_t src0_stride, - const uint16_t *src1, - uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - (void)w; - blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, blend_4_b10); -} - -static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, - uint32_t src0_stride, - const uint16_t *src1, - uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - (void)w; - blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, blend_4_b12); -} - -static INLINE void blend_a64_vmask_bn_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, - uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h, blend_unit_fn blend) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - - do { - int c; - const __m128i v_m0_w = _mm_set1_epi16(*mask); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - for (c = 0; c < w; c += 8) { - const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); - - xx_storeu_128(dst + c, v_res_w); - } - dst += dst_stride; - src0 += 
src0_stride; - src1 += src1_stride; - mask += 1; - } while (--h); -} - -static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, - uint32_t src0_stride, - const uint16_t *src1, - uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, w, h, blend_8_b10); -} - -static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, - uint32_t src0_stride, - const uint16_t *src1, - uint32_t src1_stride, - const uint8_t *mask, int w, int h) { - blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, w, h, blend_8_b12); -} - -////////////////////////////////////////////////////////////////////////////// -// Dispatch -////////////////////////////////////////////////////////////////////////////// - -void aom_highbd_blend_a64_vmask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, - uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int w, int h, int bd) { - typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int w, int h); - - // Dimensions are: bd_index X width_index - static const blend_fn blend[2][2] = { - { - // bd == 8 or 10 - blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 - blend_a64_vmask_b10_w4_sse4_1, // w == 4 - }, - { - // bd == 12 - blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 - blend_a64_vmask_b12_w4_sse4_1, // w == 4 - } - }; - - assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); - assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); - - assert(h >= 1); - assert(w >= 1); - assert(IS_POWER_OF_TWO(h)); - assert(IS_POWER_OF_TWO(w)); - - assert(bd == 8 || bd == 10 || bd == 12); - - if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - 
aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, w, h, bd); - } else { - uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); - const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); - const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - - blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, w, h); - } -} diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h deleted file mode 100644 index c071fdcfc..000000000 --- a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ -#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ -#include <smmintrin.h> // SSE4.1 - -#include <assert.h> - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/blend.h" - -#include "aom_dsp/x86/synonyms.h" - -#include "config/aom_dsp_rtcd.h" - -static INLINE void blend_a64_d16_mask_w4_sse41( - uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, - const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, - int shift) { - const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); - const __m128i s0 = xx_loadl_64(src0); - const __m128i s1 = xx_loadl_64(src1); - const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); - const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); - const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); - const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); - const __m128i res_d = _mm_srai_epi32(res_c, shift); - const __m128i res_e = _mm_packs_epi32(res_d, res_d); - const __m128i res = _mm_packus_epi16(res_e, res_e); - - xx_storel_32(dst, res); -} - -static INLINE void blend_a64_d16_mask_w8_sse41( - uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, - const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, - int shift) { - const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); - const __m128i s0 = xx_loadu_128(src0); - const __m128i s1 = xx_loadu_128(src1); - __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), - _mm_unpacklo_epi16(*m, max_minus_m)); - __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), - _mm_unpackhi_epi16(*m, max_minus_m)); - res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); - res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); - const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); - const __m128i res = _mm_packus_epi16(res_e, res_e); - - _mm_storel_epi64((__m128i *)(dst), 
res); -} - -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - for (int i = 0; i < h; ++i) { - const __m128i m0 = xx_loadl_32(mask); - const __m128i m = _mm_cvtepu8_epi16(m0); - - blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - for (int i = 0; i < h; ++i) { - const __m128i m0 = xx_loadl_64(mask); - const __m128i m = _mm_cvtepu8_epi16(m0); - blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i two_w = _mm_set1_epi16(2); - for (int i = 0; i < h; ++i) { - const __m128i m_i0 = xx_loadl_64(mask); - const __m128i m_i1 = xx_loadl_64(mask + mask_stride); - const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); - 
const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); - const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); - const __m128i m = _mm_srli_epi16(m_acbd_2, 2); - - blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i two_w = _mm_set1_epi16(2); - for (int i = 0; i < h; ++i) { - const __m128i m_i0 = xx_loadu_128(mask); - const __m128i m_i1 = xx_loadu_128(mask + mask_stride); - const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); - const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); - const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); - const __m128i m = _mm_srli_epi16(m_acbd_2, 2); - - blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i zeros = _mm_setzero_si128(); - for (int i = 0; i < h; ++i) { - const __m128i m_i0 = xx_loadl_64(mask); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m = _mm_avg_epu16(m_ac, zeros); - - 
blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i zeros = _mm_setzero_si128(); - for (int i = 0; i < h; ++i) { - const __m128i m_i0 = xx_loadu_128(mask); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m = _mm_avg_epu16(m_ac, zeros); - - blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i zeros = _mm_setzero_si128(); - for (int i = 0; i < h; ++i) { - const __m128i m_i0 = xx_loadl_64(mask); - const __m128i m_i1 = xx_loadl_64(mask + mask_stride); - const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); - const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); - - blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} - -static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, - 
uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - const __m128i *round_offset, int shift) { - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i zeros = _mm_setzero_si128(); - for (int i = 0; i < h; ++i) { - const __m128i m_i0 = xx_loadl_64(mask); - const __m128i m_i1 = xx_loadl_64(mask + mask_stride); - const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); - const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); - - blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, - shift); - mask += mask_stride << 1; - dst += dst_stride; - src0 += src0_stride; - src1 += src1_stride; - } -} -#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h deleted file mode 100644 index 8d9b32510..000000000 --- a/third_party/aom/aom_dsp/x86/blend_sse4.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ -#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ - -#include "aom_dsp/blend.h" -#include "aom_dsp/x86/synonyms.h" -static const uint8_t g_blend_a64_mask_shuffle[32] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, -}; - -////////////////////////////////////////////////////////////////////////////// -// Common kernels -////////////////////////////////////////////////////////////////////////////// - -static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, - const __m128i *v_m0_w, const __m128i *v_m1_w) { - const __m128i v_s0_b = xx_loadl_32(src0); - const __m128i v_s1_b = xx_loadl_32(src1); - const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); - const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); - - return v_res_w; -} - -static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, - const __m128i *v_m0_w, const __m128i *v_m1_w) { - const __m128i v_s0_b = xx_loadl_64(src0); - const __m128i v_s1_b = xx_loadl_64(src1); - const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); - const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); - - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); - - return v_res_w; -} - -static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, - const __m128i *v_m0_b, const __m128i *v_m1_b, - const __m128i *rounding) { - const __m128i v_s0_b = xx_loadl_32(src0); - const __m128i v_s1_b = xx_loadl_32(src1); - - const __m128i v_p0_w = 
_mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), - _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); - - const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); - const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); - return v_res; -} - -static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, - const __m128i *v_m0_b, const __m128i *v_m1_b, - const __m128i *rounding) { - const __m128i v_s0_b = xx_loadl_64(src0); - const __m128i v_s1_b = xx_loadl_64(src1); - - const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), - _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); - - const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); - const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); - return v_res; -} - -static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, - const __m128i *v_m0_b, const __m128i *v_m1_b, - const __m128i *rounding) { - const __m128i v_s0_b = xx_loadu_128(src0); - const __m128i v_s1_b = xx_loadu_128(src1); - - const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), - _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); - const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), - _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); - - const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); - const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); - const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); - return v_res; -} - -typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w); - -static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadl_64(src0); - const __m128i v_s1_w = xx_loadl_64(src1); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - - const __m128i v_res_w = 
xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); - - return v_res_w; -} - -static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadu_128(src0); - const __m128i v_s1_w = xx_loadu_128(src1); - - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - - const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); - - return v_res_w; -} - -static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadl_64(src0); - const __m128i v_s1_w = xx_loadl_64(src1); - - // Interleave - const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); - const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); - - // Multiply-Add - const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); - - // Scale - const __m128i v_ssum_d = - _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); - - // Pack - const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); - - // Round - const __m128i v_res_w = xx_round_epu16(v_pssum_d); - - return v_res_w; -} - -static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { - const __m128i v_s0_w = xx_loadu_128(src0); - const __m128i v_s1_w = xx_loadu_128(src1); - - // Interleave - const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); - const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); - const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); - const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); - - // Multiply-Add - const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); - const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); - - // Scale - const __m128i v_ssuml_d = - _mm_srli_epi32(v_suml_d, 
AOM_BLEND_A64_ROUND_BITS - 1); - const __m128i v_ssumh_d = - _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); - - // Pack - const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); - - // Round - const __m128i v_res_w = xx_round_epu16(v_pssum_d); - - return v_res_w; -} - -#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h deleted file mode 100644 index 96fe4ebb6..000000000 --- a/third_party/aom/aom_dsp/x86/common_avx2.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_ -#define AOM_AOM_DSP_X86_COMMON_AVX2_H_ - -#include <immintrin.h> - -#include "config/aom_config.h" - -// Note: in and out could have the same value -static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { - __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); - __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); - __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); - __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); - __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); - - __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); - __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); - __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); - __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); - __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); - __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); - __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); - __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); - - // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b - // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f - // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b - // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f - // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b - // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f - // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b - // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f - - // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b - // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f - // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb - // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf - // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db - // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df - // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb - // e4 f4 e5 f5 e6 f6 e7 f7 ec 
fc ed fd ee fe ef ff - - __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); - __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); - __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); - __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); - __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); - __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); - __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); - __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); - - __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); - __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); - __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); - __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); - __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); - __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); - __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); - __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); - - // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 - // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b - // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d - // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f - // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 - // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b - // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d - // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f - - // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 - // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb - // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd - // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf - // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 - // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb - // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd - // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff - - tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - 
tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - - tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); - tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); - tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); - tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); - tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); - tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); - tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); - tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); - - // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 - // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 - // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a - // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b - // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c - // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d - // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e - // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f - - // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 - // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 - // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa - // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb - // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc - // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd - // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe - // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff - - out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 - out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 - out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); - out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); - out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); - out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); - out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); - out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); - - out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); - out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); - out[5] = 
_mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); - out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); - out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); - out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); - out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); - out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); -} -#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h deleted file mode 100644 index 3e19682cd..000000000 --- a/third_party/aom/aom_dsp/x86/convolve.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ -#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_ -#define AOM_AOM_DSP_X86_CONVOLVE_H_ - -#include <assert.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, const int16_t *filter); - -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void aom_convolve8_##name##_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - (void)filter_x; \ - (void)x_step_q4; \ - (void)filter_y; \ - (void)y_step_q4; \ - assert((-128 <= filter[3]) && (filter[3] <= 127)); \ - assert(step_q4 == 16); \ - if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ - (filter[2] | filter[5])) { \ - while (w >= 16) { \ - aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } 
else { \ - while (w >= 16) { \ - aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - if (w) { \ - aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ - } - -typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, int bd); - -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void aom_highbd_convolve8_##name##_##opt( \ - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - aom_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - aom_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - aom_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, 
src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - aom_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - aom_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - aom_highbd_convolve8_##name##_c( \ - CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \ - dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } \ - } - -#endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h deleted file mode 100644 index 30253f65c..000000000 --- a/third_party/aom/aom_dsp/x86/convolve_avx2.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ -#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ - -// filters for 16 -DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, - 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, - 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, - 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, - 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, - 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { - 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, - 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, - 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { - 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, - 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, -}; - -static INLINE void prepare_coeffs_lowbd( - const InterpFilterParams *const filter_params, const int subpel_q4, - __m256i *const coeffs /* [4] */) { - const int16_t *const filter = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_q4 & SUBPEL_MASK); - const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); - const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); - - // right shift all filter co-efficients by 1 to reduce the bits required. - // This extra right shift will be taken care of at the end while rounding - // the result. 
- // Since all filter co-efficients are even, this change will not affect the - // end result - assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), - _mm_set1_epi16(0xffff))); - - const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); - - // coeffs 0 1 0 1 0 1 0 1 - coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); - // coeffs 2 3 2 3 2 3 2 3 - coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); - // coeffs 4 5 4 5 4 5 4 5 - coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); - // coeffs 6 7 6 7 6 7 6 7 - coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); -} - -static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, - const int subpel_q4, - __m256i *const coeffs /* [4] */) { - const int16_t *filter = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_q4 & SUBPEL_MASK); - - const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); - const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); - - // coeffs 0 1 0 1 0 1 0 1 - coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); - // coeffs 2 3 2 3 2 3 2 3 - coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); - // coeffs 4 5 4 5 4 5 4 5 - coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); - // coeffs 6 7 6 7 6 7 6 7 - coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); -} - -static INLINE __m256i convolve_lowbd(const __m256i *const s, - const __m256i *const coeffs) { - const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); - const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); - const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); - const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); - - // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), - _mm256_add_epi16(res_23, res_67)); - - return res; -} - -static INLINE __m256i convolve(const __m256i *const s, - const __m256i *const coeffs) { - const __m256i 
res_0 = _mm256_madd_epi16(s[0], coeffs[0]); - const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); - const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); - const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); - - const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), - _mm256_add_epi32(res_2, res_3)); - - return res; -} - -static INLINE __m256i convolve_lowbd_x(const __m256i data, - const __m256i *const coeffs, - const __m256i *const filt) { - __m256i s[4]; - - s[0] = _mm256_shuffle_epi8(data, filt[0]); - s[1] = _mm256_shuffle_epi8(data, filt[1]); - s[2] = _mm256_shuffle_epi8(data, filt[2]); - s[3] = _mm256_shuffle_epi8(data, filt[3]); - - return convolve_lowbd(s, coeffs); -} - -static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, - const __m256i *const res, - const int do_average) { - __m256i d; - if (do_average) { - d = _mm256_load_si256((__m256i *)dst); - d = _mm256_add_epi32(d, *res); - d = _mm256_srai_epi32(d, 1); - } else { - d = *res; - } - _mm256_store_si256((__m256i *)dst, d); -} - -static INLINE __m256i comp_avg(const __m256i *const data_ref_0, - const __m256i *const res_unsigned, - const __m256i *const wt, - const int use_jnt_comp_avg) { - __m256i res; - if (use_jnt_comp_avg) { - const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); - const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); - - const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); - const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); - - const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); - const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); - - res = _mm256_packs_epi32(res_lo, res_hi); - } else { - const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); - res = _mm256_srai_epi16(wt_res, 1); - } - return res; -} - -static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, - const __m256i *const offset_const, - const 
__m256i *const round_const, - const int round_shift) { - const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); - const __m256i res_round = _mm256_srai_epi16( - _mm256_add_epi16(res_signed, *round_const), round_shift); - return res_round; -} - -static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, - const __m256i *const res_unsigned, - const __m256i *const wt0, - const __m256i *const wt1, - const int use_jnt_comp_avg) { - __m256i res; - if (use_jnt_comp_avg) { - const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); - const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); - const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); - res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); - } else { - const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); - res = _mm256_srai_epi32(wt_res, 1); - } - return res; -} - -static INLINE __m256i highbd_convolve_rounding( - const __m256i *const res_unsigned, const __m256i *const offset_const, - const __m256i *const round_const, const int round_shift) { - const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); - const __m256i res_round = _mm256_srai_epi32( - _mm256_add_epi32(res_signed, *round_const), round_shift); - - return res_round; -} - -#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h deleted file mode 100644 index 707bd2d78..000000000 --- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ -#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ - -// Note: -// This header file should be put below any x86 intrinsics head file - -static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, - const int do_average) { - __m128i d; - if (do_average) { - d = _mm_load_si128((__m128i *)dst); - d = _mm_add_epi32(d, *res); - d = _mm_srai_epi32(d, 1); - } else { - d = *res; - } - _mm_store_si128((__m128i *)dst, d); -} - -#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h deleted file mode 100644 index 445d04b10..000000000 --- a/third_party/aom/aom_dsp/x86/convolve_sse2.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ -#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ - -// Note: -// This header file should be put below any x86 intrinsics head file - -static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, - const int subpel_q4, - __m128i *const coeffs /* [4] */) { - const int16_t *filter = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_q4 & SUBPEL_MASK); - const __m128i coeff = _mm_loadu_si128((__m128i *)filter); - - // coeffs 0 1 0 1 0 1 0 1 - coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); - // coeffs 2 3 2 3 2 3 2 3 - coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); - // coeffs 4 5 4 5 4 5 4 5 - coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); - // coeffs 6 7 6 7 6 7 6 7 - coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); -} - -static INLINE __m128i convolve(const __m128i *const s, - const __m128i *const coeffs) { - const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); - const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); - const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); - const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); - - const __m128i res = - _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); - - return res; -} - -static INLINE __m128i convolve_lo_x(const __m128i *const s, - const __m128i *const coeffs) { - __m128i ss[4]; - ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); - ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); - ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); - ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); - return convolve(ss, coeffs); -} - -static INLINE __m128i convolve_lo_y(const __m128i *const s, - const __m128i *const coeffs) { - __m128i ss[4]; - ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); - ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); - ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); - ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); - return convolve(ss, coeffs); -} - -static INLINE __m128i 
convolve_hi_y(const __m128i *const s, - const __m128i *const coeffs) { - __m128i ss[4]; - ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); - ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); - ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); - ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); - return convolve(ss, coeffs); -} - -static INLINE __m128i comp_avg(const __m128i *const data_ref_0, - const __m128i *const res_unsigned, - const __m128i *const wt, - const int use_jnt_comp_avg) { - __m128i res; - if (use_jnt_comp_avg) { - const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); - const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); - - const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); - const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); - - const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); - const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); - - res = _mm_packs_epi32(res_lo, res_hi); - } else { - const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); - res = _mm_srai_epi16(wt_res, 1); - } - return res; -} - -static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, - const __m128i *const offset_const, - const __m128i *const round_const, - const int round_shift) { - const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const); - const __m128i res_round = - _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); - return res_round; -} - -static INLINE __m128i highbd_convolve_rounding_sse2( - const __m128i *const res_unsigned, const __m128i *const offset_const, - const __m128i *const round_const, const int round_shift) { - const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); - const __m128i res_round = - _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); - - return res_round; -} - -#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git 
a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h deleted file mode 100644 index 6b8388d84..000000000 --- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ -#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ - -// Note: -// This header file should be put below any x86 intrinsics head file - -static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, - const __m128i *const res, - const __m128i *const wt0, - const __m128i *const wt1, - const int do_average) { - __m128i d; - if (do_average) { - d = _mm_load_si128((__m128i *)dst); - d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); - d = _mm_srai_epi32(d, DIST_PRECISION_BITS); - } else { - d = *res; - } - _mm_store_si128((__m128i *)dst, d); -} - -static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, - const __m128i *const res_unsigned, - const __m128i *const wt0, - const __m128i *const wt1, - const int use_jnt_comp_avg) { - __m128i res; - if (use_jnt_comp_avg) { - const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); - const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); - - const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); - res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); - } else { - const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); - res = _mm_srai_epi32(wt_res, 1); - } - return res; -} - 
-#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c deleted file mode 100644 index 54da02253..000000000 --- a/third_party/aom/aom_dsp/x86/fft_avx2.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/fft_common.h" - -extern void aom_transpose_float_sse2(const float *A, float *B, int n); -extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, - int n); - -// Generate the 1d forward transforms for float using _mm256 -GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, - _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, - _mm256_mul_ps); -GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, - _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, - _mm256_mul_ps); -GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, - _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, - _mm256_mul_ps); - -void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2, - aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); -} - -void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2, - aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); 
-} - -void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2, - aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); -} - -// Generate the 1d inverse transforms for float using _mm256 -GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, - _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, - _mm256_mul_ps); -GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, - _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, - _mm256_mul_ps); -GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, - _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, - _mm256_mul_ps); - -void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) { - aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2, - aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8); -} - -void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) { - aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, - aom_fft1d_16_avx2, aom_ifft1d_16_avx2, - aom_transpose_float_sse2, 8); -} - -void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) { - aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, - aom_fft1d_32_avx2, aom_ifft1d_32_avx2, - aom_transpose_float_sse2, 8); -} diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c deleted file mode 100644 index 12bdc3e18..000000000 --- a/third_party/aom/aom_dsp/x86/fft_sse2.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the -s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <xmmintrin.h> - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/fft_common.h" - -static INLINE void transpose4x4(const float *A, float *B, const int lda, - const int ldb) { - __m128 row1 = _mm_load_ps(&A[0 * lda]); - __m128 row2 = _mm_load_ps(&A[1 * lda]); - __m128 row3 = _mm_load_ps(&A[2 * lda]); - __m128 row4 = _mm_load_ps(&A[3 * lda]); - _MM_TRANSPOSE4_PS(row1, row2, row3, row4); - _mm_store_ps(&B[0 * ldb], row1); - _mm_store_ps(&B[1 * ldb], row2); - _mm_store_ps(&B[2 * ldb], row3); - _mm_store_ps(&B[3 * ldb], row4); -} - -void aom_transpose_float_sse2(const float *A, float *B, int n) { - for (int y = 0; y < n; y += 4) { - for (int x = 0; x < n; x += 4) { - transpose4x4(A + y * n + x, B + x * n + y, n, n); - } - } -} - -void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { - const int n2 = n / 2; - output[0] = packed[0]; - output[1] = 0; - output[2 * (n2 * n)] = packed[n2 * n]; - output[2 * (n2 * n) + 1] = 0; - - output[2 * n2] = packed[n2]; - output[2 * n2 + 1] = 0; - output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; - output[2 * (n2 * n + n2) + 1] = 0; - - for (int c = 1; c < n2; ++c) { - output[2 * (0 * n + c)] = packed[c]; - output[2 * (0 * n + c) + 1] = packed[c + n2]; - output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; - output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; - } - for (int r = 1; r < n2; ++r) { - output[2 * (r * n + 0)] = packed[r * n]; - output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; - output[2 * (r * n + n2) + 0] = packed[r * n + n2]; - output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; - - for (int c = 1; c < AOMMIN(n2, 4); ++c) { - output[2 * (r * n + c)] = - packed[r * n + c] - packed[(r + n2) * n + c + n2]; - output[2 * (r * n + c) + 1] = - packed[(r + n2) * n + c] + packed[r * n + c + n2]; - } - - 
for (int c = 4; c < n2; c += 4) { - __m128 real1 = _mm_load_ps(packed + r * n + c); - __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); - __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); - __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); - real1 = _mm_sub_ps(real1, real2); - imag1 = _mm_add_ps(imag1, imag2); - _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); - _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); - } - - int r2 = r + n2; - int r3 = n - r2; - output[2 * (r2 * n + 0)] = packed[r3 * n]; - output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; - output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; - output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; - for (int c = 1; c < AOMMIN(4, n2); ++c) { - output[2 * (r2 * n + c)] = - packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; - output[2 * (r2 * n + c) + 1] = - -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; - } - for (int c = 4; c < n2; c += 4) { - __m128 real1 = _mm_load_ps(packed + r3 * n + c); - __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); - __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); - __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); - real1 = _mm_add_ps(real1, real2); - imag1 = _mm_sub_ps(imag2, imag1); - _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); - _mm_store_ps(output + 2 * (r2 * n + c + 2), - _mm_unpackhi_ps(real1, imag1)); - } - } -} - -// Generate definitions for 1d transforms using float and __mm128 -GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - _mm_set1_ps, _mm_add_ps, _mm_sub_ps); -GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); -GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); -GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - 
_mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); - -void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, - aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); -} - -void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, - aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); -} - -void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, - aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); -} - -void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { - aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, - aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); -} - -// Generate definitions for 1d inverse transforms using float and mm128 -GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - _mm_set1_ps, _mm_add_ps, _mm_sub_ps); -GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); -GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); -GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, - _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); - -void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { - aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, - aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); -} - -void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { - aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, - aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); -} - -void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { - 
aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, - aom_fft1d_16_sse2, aom_ifft1d_16_sse2, - aom_transpose_float_sse2, 4); -} - -void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { - aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, - aom_fft1d_32_sse2, aom_ifft1d_32_sse2, - aom_transpose_float_sse2, 4); -} diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h deleted file mode 100644 index 1e3d13ec8..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/fwd_txfm_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "aom_ports/mem.h" - -// TODO(jingning) The high bit-depth functions need rework for performance. -// After we properly fix the high bit-depth function implementations, this -// file's dependency should be substantially simplified. -#if DCT_HIGH_BIT_DEPTH -#define ADD_EPI16 _mm_adds_epi16 -#define SUB_EPI16 _mm_subs_epi16 - -#else -#define ADD_EPI16 _mm_add_epi16 -#define SUB_EPI16 _mm_sub_epi16 -#endif - -void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. 
This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. 
- for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. - __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = ADD_EPI16(in0, in7); - const __m128i q1 = ADD_EPI16(in1, in6); - const __m128i q2 = ADD_EPI16(in2, in5); - const __m128i q3 = ADD_EPI16(in3, in4); - const __m128i q4 = SUB_EPI16(in3, in4); - const __m128i q5 = SUB_EPI16(in2, in5); - const __m128i q6 = SUB_EPI16(in1, in6); - const __m128i q7 = SUB_EPI16(in0, in7); -#if DCT_HIGH_BIT_DEPTH - if (pass == 1) { - overflow = - check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } - } -#endif // DCT_HIGH_BIT_DEPTH - // Work on first four results - { - // Add/subtract - const __m128i r0 = ADD_EPI16(q0, q3); - const __m128i r1 = ADD_EPI16(q1, q2); - const __m128i r2 = SUB_EPI16(q1, q2); - const __m128i r3 = SUB_EPI16(q0, q3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us into 32bits - { - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i 
v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, 
DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = _mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&r0, &r1); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - { - // Add/subtract - const __m128i x0 = ADD_EPI16(q4, r0); - const __m128i x1 = SUB_EPI16(q4, r0); - const __m128i x2 = SUB_EPI16(q7, r1); - const __m128i x3 = ADD_EPI16(q7, r1); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us into 32bits - { - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - 
const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); - if (overflow) { - aom_highbd_fdct8x8_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - 
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, 
sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - // store results - store_output(&in0, (output + 0 * 8)); - store_output(&in1, (output + 1 * 8)); - store_output(&in2, (output + 2 * 8)); - store_output(&in3, (output + 3 * 8)); - store_output(&in4, (output + 4 * 8)); - store_output(&in5, (output + 5 * 8)); - store_output(&in6, (output + 6 * 8)); - store_output(&in7, (output + 7 * 8)); - } -} - -#undef ADD_EPI16 -#undef SUB_EPI16 diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c deleted file mode 100644 index 2d8f8f71e..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/x86/fwd_txfm_sse2.h" - -void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i u0, u1, sum; - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - sum = _mm_add_epi16(u0, u1); - - in0 = _mm_add_epi16(in0, in1); - in2 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, in0); - - u0 = _mm_setzero_si128(); - sum = _mm_add_epi16(sum, in2); - - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); -} - -#define DCT_HIGH_BIT_DEPTH 0 -#define FDCT8x8_2D aom_fdct8x8_sse2 -#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" -#undef FDCT8x8_2D - -#undef DCT_HIGH_BIT_DEPTH -#define DCT_HIGH_BIT_DEPTH 1 -#define FDCT8x8_2D aom_highbd_fdct8x8_sse2 -#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT -#undef FDCT8x8_2D diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h deleted file mode 100644 index 260d8dd58..000000000 --- 
a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ -#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { - __m128i buf0, buf1; - buf0 = _mm_mul_epu32(a, b); - a = _mm_srli_epi64(a, 32); - b = _mm_srli_epi64(b, 32); - buf1 = _mm_mul_epu32(a, b); - return _mm_add_epi64(buf0, buf1); -} - -static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { - __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); - __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); - return _mm_unpacklo_epi64(buf0, buf1); -} - -static INLINE int check_epi16_overflow_x2(const __m128i *preg0, - const __m128i *preg1) { - const __m128i max_overflow = _mm_set1_epi16(0x7fff); - const __m128i min_overflow = _mm_set1_epi16(0x8000); - __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), - _mm_cmpeq_epi16(*preg0, min_overflow)); - __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), - _mm_cmpeq_epi16(*preg1, min_overflow)); - cmp0 = _mm_or_si128(cmp0, cmp1); - return _mm_movemask_epi8(cmp0); -} - -static INLINE int check_epi16_overflow_x4(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3) { - const __m128i max_overflow = _mm_set1_epi16(0x7fff); - const __m128i min_overflow = _mm_set1_epi16(0x8000); - __m128i cmp0 = 
_mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), - _mm_cmpeq_epi16(*preg0, min_overflow)); - __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), - _mm_cmpeq_epi16(*preg1, min_overflow)); - __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), - _mm_cmpeq_epi16(*preg2, min_overflow)); - __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), - _mm_cmpeq_epi16(*preg3, min_overflow)); - cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); - return _mm_movemask_epi8(cmp0); -} - -static INLINE int check_epi16_overflow_x8( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7) { - int res0, res1; - res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); - res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); - return res0 + res1; -} - -static INLINE int check_epi16_overflow_x12( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { - int res0, res1; - res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); - res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); - if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); - return res0 + res1; -} - -static INLINE int check_epi16_overflow_x16( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i *preg15) { - int res0, res1; - res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); 
- res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); - if (!res0) { - res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); - if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); - } - return res0 + res1; -} - -static INLINE int check_epi16_overflow_x32( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, - const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, - const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, - const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, - const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, - const __m128i *preg30, const __m128i *preg31) { - int res0, res1; - res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); - res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); - if (!res0) { - res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); - if (!res1) { - res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); - if (!res0) { - res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); - if (!res1) { - res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); - if (!res0) { - res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); - if (!res1) - res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); - } - } - } - } - } - return res0 + res1; -} - -static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { - if (sizeof(tran_low_t) == 4) { - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - 
__m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_store_si128((__m128i *)(dst_ptr), out0); - _mm_store_si128((__m128i *)(dst_ptr + 4), out1); - } else { - _mm_store_si128((__m128i *)(dst_ptr), *poutput); - } -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm deleted file mode 100644 index c1fb259a1..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,379 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 -pd_8192: times 4 dd 8192 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 -%endmacro - -TRANSFORM_COEFFS 11585, 11585 -TRANSFORM_COEFFS 15137, 6270 -TRANSFORM_COEFFS 16069, 3196 -TRANSFORM_COEFFS 9102, 13623 - -%macro STORE_OUTPUT 2 ; index, result - ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - ; _mm_store_si128((__m128i *)(dst_ptr), out0); - ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); - pxor m11, m11 - pcmpgtw m11, m%2 - movdqa m12, m%2 - punpcklwd m%2, m11 - punpckhwd m12, m11 - mova [outputq + 4*%1 + 0], m%2 - mova [outputq + 4*%1 + 16], m12 -%endmacro - -SECTION .text - -%if ARCH_X86_64 -INIT_XMM ssse3 -cglobal fdct8x8, 3, 5, 13, input, output, stride - - mova m8, [GLOBAL(pd_8192)] - mova m12, [GLOBAL(pw_11585x2)] - - lea r3, [2 * strideq] - lea r4, [4 * strideq] - mova m0, [inputq] - mova m1, [inputq + r3] - lea inputq, [inputq + r4] - mova m2, [inputq] - mova m3, [inputq + r3] - lea inputq, [inputq + r4] - mova m4, [inputq] - mova m5, [inputq + r3] - lea inputq, [inputq + r4] - mova m6, [inputq] - mova m7, [inputq + r3] - - ; left shift by 2 to increase forward transformation precision - psllw m0, 2 - psllw m1, 2 - psllw m2, 2 - psllw m3, 2 - psllw m4, 2 - psllw m5, 2 - psllw m6, 2 - psllw m7, 2 - - ; column transform - ; stage 1 - paddw m10, m0, m7 - psubw m0, m7 - - paddw m9, m1, m6 - psubw m1, m6 - - paddw m7, m2, m5 - psubw m2, m5 - - paddw m6, m3, m4 - psubw m3, m4 - - ; stage 2 - paddw m5, m9, m7 - psubw m9, m7 - - paddw m4, m10, m6 - psubw m10, m6 - - paddw m7, m1, m2 - psubw m1, m2 - - ; stage 3 - paddw m6, m4, m5 - psubw m4, m5 - - pmulhrsw m1, m12 - pmulhrsw m7, m12 - - ; sin(pi / 8), cos(pi / 8) - punpcklwd m2, m10, m9 - punpckhwd 
m10, m9 - pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] - pmaddwd m2, [GLOBAL(pw_6270_m15137)] - pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] - pmaddwd m10, [GLOBAL(pw_6270_m15137)] - paddd m5, m8 - paddd m2, m8 - paddd m9, m8 - paddd m10, m8 - psrad m5, 14 - psrad m2, 14 - psrad m9, 14 - psrad m10, 14 - packssdw m5, m9 - packssdw m2, m10 - - pmulhrsw m6, m12 - pmulhrsw m4, m12 - - paddw m9, m3, m1 - psubw m3, m1 - - paddw m10, m0, m7 - psubw m0, m7 - - ; stage 4 - ; sin(pi / 16), cos(pi / 16) - punpcklwd m1, m10, m9 - punpckhwd m10, m9 - pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] - pmaddwd m1, [GLOBAL(pw_3196_m16069)] - pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] - pmaddwd m10, [GLOBAL(pw_3196_m16069)] - paddd m7, m8 - paddd m1, m8 - paddd m9, m8 - paddd m10, m8 - psrad m7, 14 - psrad m1, 14 - psrad m9, 14 - psrad m10, 14 - packssdw m7, m9 - packssdw m1, m10 - - ; sin(3 * pi / 16), cos(3 * pi / 16) - punpcklwd m11, m0, m3 - punpckhwd m0, m3 - pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] - pmaddwd m11, [GLOBAL(pw_13623_m9102)] - pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] - pmaddwd m0, [GLOBAL(pw_13623_m9102)] - paddd m9, m8 - paddd m11, m8 - paddd m3, m8 - paddd m0, m8 - psrad m9, 14 - psrad m11, 14 - psrad m3, 14 - psrad m0, 14 - packssdw m9, m3 - packssdw m11, m0 - - ; transpose - ; stage 1 - punpcklwd m0, m6, m7 - punpcklwd m3, m5, m11 - punpckhwd m6, m7 - punpckhwd m5, m11 - punpcklwd m7, m4, m9 - punpcklwd m10, m2, m1 - punpckhwd m4, m9 - punpckhwd m2, m1 - - ; stage 2 - punpckldq m9, m0, m3 - punpckldq m1, m6, m5 - punpckhdq m0, m3 - punpckhdq m6, m5 - punpckldq m3, m7, m10 - punpckldq m5, m4, m2 - punpckhdq m7, m10 - punpckhdq m4, m2 - - ; stage 3 - punpcklqdq m10, m9, m3 - punpckhqdq m9, m3 - punpcklqdq m2, m0, m7 - punpckhqdq m0, m7 - punpcklqdq m3, m1, m5 - punpckhqdq m1, m5 - punpcklqdq m7, m6, m4 - punpckhqdq m6, m4 - - ; row transform - ; stage 1 - paddw m5, m10, m6 - psubw m10, m6 - - paddw m4, m9, m7 - psubw m9, m7 - - paddw m6, m2, m1 - psubw m2, m1 - - paddw m7, m0, m3 - 
psubw m0, m3 - - ;stage 2 - paddw m1, m5, m7 - psubw m5, m7 - - paddw m3, m4, m6 - psubw m4, m6 - - paddw m7, m9, m2 - psubw m9, m2 - - ; stage 3 - punpcklwd m6, m1, m3 - punpckhwd m1, m3 - pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] - pmaddwd m6, [GLOBAL(pw_11585_m11585)] - pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] - pmaddwd m1, [GLOBAL(pw_11585_m11585)] - paddd m2, m8 - paddd m6, m8 - paddd m3, m8 - paddd m1, m8 - psrad m2, 14 - psrad m6, 14 - psrad m3, 14 - psrad m1, 14 - packssdw m2, m3 - packssdw m6, m1 - - pmulhrsw m7, m12 - pmulhrsw m9, m12 - - punpcklwd m3, m5, m4 - punpckhwd m5, m4 - pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] - pmaddwd m3, [GLOBAL(pw_6270_m15137)] - pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] - pmaddwd m5, [GLOBAL(pw_6270_m15137)] - paddd m1, m8 - paddd m3, m8 - paddd m4, m8 - paddd m5, m8 - psrad m1, 14 - psrad m3, 14 - psrad m4, 14 - psrad m5, 14 - packssdw m1, m4 - packssdw m3, m5 - - paddw m4, m0, m9 - psubw m0, m9 - - paddw m5, m10, m7 - psubw m10, m7 - - ; stage 4 - punpcklwd m9, m5, m4 - punpckhwd m5, m4 - pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] - pmaddwd m9, [GLOBAL(pw_3196_m16069)] - pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] - pmaddwd m5, [GLOBAL(pw_3196_m16069)] - paddd m7, m8 - paddd m9, m8 - paddd m4, m8 - paddd m5, m8 - psrad m7, 14 - psrad m9, 14 - psrad m4, 14 - psrad m5, 14 - packssdw m7, m4 - packssdw m9, m5 - - punpcklwd m4, m10, m0 - punpckhwd m10, m0 - pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] - pmaddwd m4, [GLOBAL(pw_13623_m9102)] - pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] - pmaddwd m10, [GLOBAL(pw_13623_m9102)] - paddd m5, m8 - paddd m4, m8 - paddd m0, m8 - paddd m10, m8 - psrad m5, 14 - psrad m4, 14 - psrad m0, 14 - psrad m10, 14 - packssdw m5, m0 - packssdw m4, m10 - - ; transpose - ; stage 1 - punpcklwd m0, m2, m7 - punpcklwd m10, m1, m4 - punpckhwd m2, m7 - punpckhwd m1, m4 - punpcklwd m7, m6, m5 - punpcklwd m4, m3, m9 - punpckhwd m6, m5 - punpckhwd m3, m9 - - ; stage 2 - punpckldq m5, m0, m10 - punpckldq m9, m2, m1 - punpckhdq 
m0, m10 - punpckhdq m2, m1 - punpckldq m10, m7, m4 - punpckldq m1, m6, m3 - punpckhdq m7, m4 - punpckhdq m6, m3 - - ; stage 3 - punpcklqdq m4, m5, m10 - punpckhqdq m5, m10 - punpcklqdq m3, m0, m7 - punpckhqdq m0, m7 - punpcklqdq m10, m9, m1 - punpckhqdq m9, m1 - punpcklqdq m7, m2, m6 - punpckhqdq m2, m6 - - psraw m1, m4, 15 - psraw m6, m5, 15 - psraw m8, m3, 15 - psraw m11, m0, 15 - - psubw m4, m1 - psubw m5, m6 - psubw m3, m8 - psubw m0, m11 - - psraw m4, 1 - psraw m5, 1 - psraw m3, 1 - psraw m0, 1 - - psraw m1, m10, 15 - psraw m6, m9, 15 - psraw m8, m7, 15 - psraw m11, m2, 15 - - psubw m10, m1 - psubw m9, m6 - psubw m7, m8 - psubw m2, m11 - - psraw m10, 1 - psraw m9, 1 - psraw m7, 1 - psraw m2, 1 - - STORE_OUTPUT 0, 4 - STORE_OUTPUT 8, 5 - STORE_OUTPUT 16, 3 - STORE_OUTPUT 24, 0 - STORE_OUTPUT 32, 10 - STORE_OUTPUT 40, 9 - STORE_OUTPUT 48, 7 - STORE_OUTPUT 56, 2 - - RET -%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c deleted file mode 100644 index 099fcf7fc..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c +++ /dev/null @@ -1,998 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ -#include <immintrin.h> -#include <string.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve.h" -#include "aom_dsp/x86/convolve_avx2.h" -#include "aom_dsp/x86/synonyms.h" - -// ----------------------------------------------------------------------------- -// Copy and average - -void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int width, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - (void)bd; - - assert(width % 4 == 0); - if (width > 32) { // width = 64 - do { - const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); - const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); - const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); - const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); - src += src_stride; - _mm256_storeu_si256((__m256i *)dst, p0); - _mm256_storeu_si256((__m256i *)(dst + 16), p1); - _mm256_storeu_si256((__m256i *)(dst + 32), p2); - _mm256_storeu_si256((__m256i *)(dst + 48), p3); - dst += dst_stride; - h--; - } while (h > 0); - } else if (width > 16) { // width = 32 - do { - const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); - const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); - src += src_stride; - _mm256_storeu_si256((__m256i *)dst, p0); - _mm256_storeu_si256((__m256i *)(dst + 16), p1); - dst += dst_stride; - h--; - } while (h > 0); - } else if (width > 8) { // width = 16 - __m256i p0, p1; - do { - p0 = _mm256_loadu_si256((const __m256i *)src); - src += src_stride; - p1 = _mm256_loadu_si256((const __m256i *)src); - src += src_stride; - - _mm256_storeu_si256((__m256i *)dst, p0); - dst += dst_stride; - _mm256_storeu_si256((__m256i *)dst, 
p1); - dst += dst_stride; - h -= 2; - } while (h > 0); - } else if (width > 4) { // width = 8 - __m128i p0, p1; - do { - p0 = _mm_loadu_si128((const __m128i *)src); - src += src_stride; - p1 = _mm_loadu_si128((const __m128i *)src); - src += src_stride; - - _mm_storeu_si128((__m128i *)dst, p0); - dst += dst_stride; - _mm_storeu_si128((__m128i *)dst, p1); - dst += dst_stride; - h -= 2; - } while (h > 0); - } else { // width = 4 - __m128i p0, p1; - do { - p0 = _mm_loadl_epi64((const __m128i *)src); - src += src_stride; - p1 = _mm_loadl_epi64((const __m128i *)src); - src += src_stride; - - _mm_storel_epi64((__m128i *)dst, p0); - dst += dst_stride; - _mm_storel_epi64((__m128i *)dst, p1); - dst += dst_stride; - h -= 2; - } while (h > 0); - } -} - -void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride; - (void)filter_params_x; - (void)subpel_x_q4; - (void)conv_params; - - assert(conv_params->round_0 <= FILTER_BITS); - assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || - ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - - __m256i s[8], coeffs_y[4]; - - const int bits = FILTER_BITS; - - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); - const __m256i clip_pixel = - _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); - const __m256i zero = _mm256_setzero_si256(); - - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - for (j = 0; j < w; j += 8) { - const uint16_t *data = &src_ptr[j]; - /* Vertical filter */ - { - __m256i src6; - __m256i s01 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - __m256i s12 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - __m256i s23 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - __m256i s34 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - __m256i s45 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - __m256i s56 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi16(s01, s12); - s[1] = _mm256_unpacklo_epi16(s23, s34); - s[2] = _mm256_unpacklo_epi16(s45, s56); - - s[4] = _mm256_unpackhi_epi16(s01, s12); - s[5] = _mm256_unpackhi_epi16(s23, s34); - s[6] = _mm256_unpackhi_epi16(s45, s56); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - - const __m256i s67 = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * 
src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - - const __m256i s78 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); - - s[3] = _mm256_unpacklo_epi16(s67, s78); - s[7] = _mm256_unpackhi_epi16(s67, s78); - - const __m256i res_a = convolve(s, coeffs_y); - - __m256i res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); - - if (w - j > 4) { - const __m256i res_b = convolve(s + 4, coeffs_y); - __m256i res_b_round = _mm256_sra_epi32( - _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); - - __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); - res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); - res_16bit = _mm256_max_epi16(res_16bit, zero); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res_16bit)); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res_16bit, 1)); - } else if (w == 4) { - res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); - res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); - res_a_round = _mm256_max_epi16(res_a_round, zero); - - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res_a_round)); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res_a_round, 1)); - } else { - res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); - res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); - res_a_round = _mm256_max_epi16(res_a_round, zero); - - xx_storel_32((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res_a_round)); - xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res_a_round, 1)); - } - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } - } -} - -void 
av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_horiz; - (void)subpel_y_q4; - (void)filter_params_y; - - // Check that, even with 12-bit input, the intermediate values will fit - // into an unsigned 16-bit intermediate array. - assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); - - __m256i s[4], coeffs_x[4]; - - const __m256i round_const_x = - _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); - const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); - - const int bits = FILTER_BITS - conv_params->round_0; - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); - const __m256i clip_pixel = - _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); - const __m256i zero = _mm256_setzero_si256(); - - assert(bits >= 0); - assert((FILTER_BITS - conv_params->round_1) >= 0 || - ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); - - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - for (i = 0; i < h; i += 2) { - const __m256i row0 = - _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); - __m256i row1 = - _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); - - const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); - const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); - - // even pixels - s[0] = _mm256_alignr_epi8(r1, r0, 0); - s[1] = _mm256_alignr_epi8(r1, r0, 4); - s[2] = _mm256_alignr_epi8(r1, r0, 8); - s[3] = _mm256_alignr_epi8(r1, r0, 12); - - __m256i res_even = convolve(s, coeffs_x); - res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), - round_shift_x); - - // odd pixels - s[0] = _mm256_alignr_epi8(r1, r0, 2); - s[1] = _mm256_alignr_epi8(r1, r0, 6); - s[2] = _mm256_alignr_epi8(r1, r0, 10); - s[3] = _mm256_alignr_epi8(r1, r0, 14); - - __m256i res_odd = convolve(s, coeffs_x); - res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), - round_shift_x); - - res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), - round_shift_bits); - res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), - round_shift_bits); - - __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); - __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); - - __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); - res = _mm256_min_epi16(res, clip_pixel); - res = _mm256_max_epi16(res, zero); - - if (w - j > 4) { - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res)); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res, 1)); - } else if (w == 4) { - 
_mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res)); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res, 1)); - } else { - xx_storel_32((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res)); - xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res, 1)); - } - } - } -} - -#define CONV8_ROUNDING_BITS (7) - -// ----------------------------------------------------------------------------- -// Horizontal and vertical filtering - -static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, - 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, - 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; - -static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, - 8, 9, 10, 11, 10, 11, 12, 13, - 4, 5, 6, 7, 6, 7, 8, 9, - 8, 9, 10, 11, 10, 11, 12, 13 }; - -static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, - 10, 11, 12, 13, 12, 13, 14, 15, - 6, 7, 8, 9, 8, 9, 10, 11, - 10, 11, 12, 13, 12, 13, 14, 15 }; - -static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; - -// ----------------------------------------------------------------------------- -// Horizontal Filtering - -static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { - const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); - const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); - const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); - const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); - - p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 - p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 - p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 - p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 -} - -// Note: -// Shared by 8x2 and 16x1 block -static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, - __m256i *x /*x[8]*/) { - __m256i pp[8]; - pack_pixels(s0, pp); - pack_pixels(s1, &pp[4]); - x[0] = 
_mm256_permute2x128_si256(pp[0], pp[4], 0x20); - x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); - x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); - x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); - x[4] = x[2]; - x[5] = x[3]; - x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); - x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); -} - -static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { - __m256i pp[8]; - __m256i s0; - s0 = _mm256_loadu_si256((const __m256i *)src); - pack_pixels(&s0, pp); - x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); - x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); - x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); - x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); -} - -static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, - __m256i *x) { - __m256i s0, s1; - s0 = _mm256_loadu_si256((const __m256i *)src); - s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); - pack_16_pixels(&s0, &s1, x); -} - -static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { - __m256i s0, s1; - s0 = _mm256_loadu_si256((const __m256i *)src); - s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); - pack_16_pixels(&s0, &s1, x); -} - -// Note: -// Shared by horizontal and vertical filtering -static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { - const __m128i h = _mm_loadu_si128((const __m128i *)filter); - const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); - const __m256i p0 = _mm256_set1_epi32(0x03020100); - const __m256i p1 = _mm256_set1_epi32(0x07060504); - const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); - const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); - f[0] = _mm256_shuffle_epi8(hh, p0); - f[1] = _mm256_shuffle_epi8(hh, p1); - f[2] = _mm256_shuffle_epi8(hh, p2); - f[3] = _mm256_shuffle_epi8(hh, p3); -} - -static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, - const __m256i *fil /*fil[4]*/, - 
__m256i *y) { - __m256i a, a0, a1; - - a0 = _mm256_madd_epi16(fil[0], sig[0]); - a1 = _mm256_madd_epi16(fil[3], sig[3]); - a = _mm256_add_epi32(a0, a1); - - a0 = _mm256_madd_epi16(fil[1], sig[1]); - a1 = _mm256_madd_epi16(fil[2], sig[2]); - - { - const __m256i min = _mm256_min_epi32(a0, a1); - a = _mm256_add_epi32(a, min); - } - { - const __m256i max = _mm256_max_epi32(a0, a1); - a = _mm256_add_epi32(a, max); - } - { - const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); - a = _mm256_add_epi32(a, rounding); - *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); - } -} - -static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, - uint16_t *dst) { - const __m128i a0 = _mm256_castsi256_si128(*y); - const __m128i a1 = _mm256_extractf128_si256(*y, 1); - __m128i res = _mm_packus_epi32(a0, a1); - res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); - _mm_storeu_si128((__m128i *)dst, res); -} - -static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - __m256i a = _mm256_packus_epi32(*y0, *y1); - a = _mm256_min_epi16(a, *mask); - _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); - _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); -} - -static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst) { - __m256i a = _mm256_packus_epi32(*y0, *y1); - a = _mm256_min_epi16(a, *mask); - _mm256_storeu_si256((__m256i *)dst, a); -} - -static void aom_highbd_filter_block1d8_h8_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[8], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - src_ptr -= 3; - do { - pack_8x2_pixels(src_ptr, src_pitch, signal); - filter_8x1_pixels(signal, ff, &res0); - 
filter_8x1_pixels(&signal[4], ff, &res1); - store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - height -= 2; - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - } while (height > 1); - - if (height > 0) { - pack_8x1_pixels(src_ptr, signal); - filter_8x1_pixels(signal, ff, &res0); - store_8x1_pixels(&res0, &max, dst_ptr); - } -} - -static void aom_highbd_filter_block1d16_h8_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[8], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - src_ptr -= 3; - do { - pack_16x1_pixels(src_ptr, signal); - filter_8x1_pixels(signal, ff, &res0); - filter_8x1_pixels(&signal[4], ff, &res1); - store_16x1_pixels(&res0, &res1, &max, dst_ptr); - height -= 1; - src_ptr += src_pitch; - dst_ptr += dst_pitch; - } while (height > 0); -} - -// ----------------------------------------------------------------------------- -// 2-tap horizontal filtering - -static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { - const __m128i h = _mm_loadu_si128((const __m128i *)filter); - const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); - const __m256i p = _mm256_set1_epi32(0x09080706); - f[0] = _mm256_shuffle_epi8(hh, p); -} - -// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() -// the difference is s0/s1 specifies first and second rows or, -// first 16 samples and 8-sample shifted 16 samples -static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, - __m256i *sig) { - const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); - const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); - __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); - __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); - __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); - __m256i r1 = 
_mm256_permutevar8x32_epi32(*s1, idx); - r0 = _mm256_shuffle_epi8(r0, sf2); - r1 = _mm256_shuffle_epi8(r1, sf2); - sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); - sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); -} - -static INLINE void pack_8x2_2t_pixels(const uint16_t *src, - const ptrdiff_t pitch, __m256i *sig) { - const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); - const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); - pack_16_2t_pixels(&r0, &r1, sig); -} - -static INLINE void pack_16x1_2t_pixels(const uint16_t *src, - __m256i *sig /*sig[2]*/) { - const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); - const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); - pack_16_2t_pixels(&r0, &r1, sig); -} - -static INLINE void pack_8x1_2t_pixels(const uint16_t *src, - __m256i *sig /*sig[2]*/) { - const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); - const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); - __m256i r0 = _mm256_loadu_si256((const __m256i *)src); - __m256i x0 = _mm256_shuffle_epi8(r0, sf2); - r0 = _mm256_permutevar8x32_epi32(r0, idx); - r0 = _mm256_shuffle_epi8(r0, sf2); - sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); -} - -// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() -static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, - __m256i *y0, __m256i *y1) { - const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); - __m256i x0 = _mm256_madd_epi16(sig[0], *f); - __m256i x1 = _mm256_madd_epi16(sig[1], *f); - x0 = _mm256_add_epi32(x0, rounding); - x1 = _mm256_add_epi32(x1, rounding); - *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); - *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); -} - -static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, - __m256i *y0) { - const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); - __m256i x0 = _mm256_madd_epi16(sig[0], *f); - 
x0 = _mm256_add_epi32(x0, rounding); - *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); -} - -static void aom_highbd_filter_block1d8_h2_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[2], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff; - pack_2t_filter(filter, &ff); - - src_ptr -= 3; - do { - pack_8x2_2t_pixels(src_ptr, src_pitch, signal); - filter_16_2t_pixels(signal, &ff, &res0, &res1); - store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - height -= 2; - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - } while (height > 1); - - if (height > 0) { - pack_8x1_2t_pixels(src_ptr, signal); - filter_8x1_2t_pixels(signal, &ff, &res0); - store_8x1_pixels(&res0, &max, dst_ptr); - } -} - -static void aom_highbd_filter_block1d16_h2_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[2], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff; - pack_2t_filter(filter, &ff); - - src_ptr -= 3; - do { - pack_16x1_2t_pixels(src_ptr, signal); - filter_16_2t_pixels(signal, &ff, &res0, &res1); - store_16x1_pixels(&res0, &res1, &max, dst_ptr); - height -= 1; - src_ptr += src_pitch; - dst_ptr += dst_pitch; - } while (height > 0); -} - -// ----------------------------------------------------------------------------- -// Vertical Filtering - -static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { - __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); - __m256i s1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); - __m256i s2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); - __m256i s3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); - __m256i s4 
= _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); - __m256i s5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); - __m256i s6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); - - s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); - s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); - s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); - s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); - s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); - s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); - - sig[0] = _mm256_unpacklo_epi16(s0, s1); - sig[4] = _mm256_unpackhi_epi16(s0, s1); - sig[1] = _mm256_unpacklo_epi16(s2, s3); - sig[5] = _mm256_unpackhi_epi16(s2, s3); - sig[2] = _mm256_unpacklo_epi16(s4, s5); - sig[6] = _mm256_unpackhi_epi16(s4, s5); - sig[8] = s6; -} - -static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, - __m256i *sig) { - // base + 7th row - __m256i s0 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); - // base + 8th row - __m256i s1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); - __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); - __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); - sig[3] = _mm256_unpacklo_epi16(s2, s3); - sig[7] = _mm256_unpackhi_epi16(s2, s3); - sig[8] = s1; -} - -static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, - __m256i *y0, __m256i *y1) { - filter_8x1_pixels(sig, f, y0); - filter_8x1_pixels(&sig[4], f, y1); -} - -static INLINE void update_pixels(__m256i *sig) { - int i; - for (i = 0; i < 3; ++i) { - sig[i] = sig[i + 1]; - sig[i + 4] = sig[i + 5]; - } -} - -static void aom_highbd_filter_block1d8_v8_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, 
uint32_t height, const int16_t *filter, int bd) { - __m256i signal[9], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - pack_8x9_init(src_ptr, src_pitch, signal); - - do { - pack_8x9_pixels(src_ptr, src_pitch, signal); - - filter_8x9_pixels(signal, ff, &res0, &res1); - store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - update_pixels(signal); - - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - height -= 2; - } while (height > 0); -} - -static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { - __m256i u0, u1, u2, u3; - // load 0-6 rows - const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); - const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); - const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); - const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); - const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); - const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); - const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); - - u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low - u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high - - u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low - u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high - - sig[0] = _mm256_unpacklo_epi16(u0, u2); - sig[4] = _mm256_unpackhi_epi16(u0, u2); - - sig[8] = _mm256_unpacklo_epi16(u1, u3); - sig[12] = _mm256_unpackhi_epi16(u1, u3); - - u0 = _mm256_permute2x128_si256(s2, s3, 0x20); - u1 = _mm256_permute2x128_si256(s2, s3, 0x31); - - u2 = _mm256_permute2x128_si256(s3, s4, 0x20); - u3 = _mm256_permute2x128_si256(s3, s4, 0x31); - - sig[1] = _mm256_unpacklo_epi16(u0, u2); - sig[5] = _mm256_unpackhi_epi16(u0, u2); - - sig[9] = _mm256_unpacklo_epi16(u1, u3); - sig[13] = _mm256_unpackhi_epi16(u1, u3); - - u0 = 
_mm256_permute2x128_si256(s4, s5, 0x20); - u1 = _mm256_permute2x128_si256(s4, s5, 0x31); - - u2 = _mm256_permute2x128_si256(s5, s6, 0x20); - u3 = _mm256_permute2x128_si256(s5, s6, 0x31); - - sig[2] = _mm256_unpacklo_epi16(u0, u2); - sig[6] = _mm256_unpackhi_epi16(u0, u2); - - sig[10] = _mm256_unpacklo_epi16(u1, u3); - sig[14] = _mm256_unpackhi_epi16(u1, u3); - - sig[16] = s6; -} - -static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, - __m256i *sig) { - // base + 7th row - const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); - // base + 8th row - const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); - - __m256i u0, u1, u2, u3; - u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); - u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); - - u2 = _mm256_permute2x128_si256(s7, s8, 0x20); - u3 = _mm256_permute2x128_si256(s7, s8, 0x31); - - sig[3] = _mm256_unpacklo_epi16(u0, u2); - sig[7] = _mm256_unpackhi_epi16(u0, u2); - - sig[11] = _mm256_unpacklo_epi16(u1, u3); - sig[15] = _mm256_unpackhi_epi16(u1, u3); - - sig[16] = s8; -} - -static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, - __m256i *y0, __m256i *y1) { - __m256i res[4]; - int i; - for (i = 0; i < 4; ++i) { - filter_8x1_pixels(&sig[i << 2], f, &res[i]); - } - - { - const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); - const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); - *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); - *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); - } -} - -static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - __m256i p = _mm256_min_epi16(*y0, *mask); - _mm256_storeu_si256((__m256i *)dst, p); - p = _mm256_min_epi16(*y1, *mask); - _mm256_storeu_si256((__m256i *)(dst + pitch), p); -} - -static void update_16x9_pixels(__m256i *sig) { - update_pixels(&sig[0]); - update_pixels(&sig[8]); -} - -static void 
aom_highbd_filter_block1d16_v8_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[17], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - pack_16x9_init(src_ptr, src_pitch, signal); - - do { - pack_16x9_pixels(src_ptr, src_pitch, signal); - filter_16x9_pixels(signal, ff, &res0, &res1); - store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - update_16x9_pixels(signal); - - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - height -= 2; - } while (height > 0); -} - -// ----------------------------------------------------------------------------- -// 2-tap vertical filtering - -static void pack_16x2_init(const uint16_t *src, __m256i *sig) { - sig[2] = _mm256_loadu_si256((const __m256i *)src); -} - -static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, - __m256i *sig) { - // load the next row - const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); - sig[0] = _mm256_unpacklo_epi16(sig[2], u); - sig[1] = _mm256_unpackhi_epi16(sig[2], u); - sig[2] = u; -} - -static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, - __m256i *y0, __m256i *y1) { - filter_16_2t_pixels(sig, f, y0, y1); -} - -static void aom_highbd_filter_block1d16_v2_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[3], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - __m256i ff; - - pack_2t_filter(filter, &ff); - pack_16x2_init(src_ptr, signal); - - do { - pack_16x2_2t_pixels(src_ptr, src_pitch, signal); - filter_16x2_2t_pixels(signal, &ff, &res0, &res1); - store_16x1_pixels(&res0, &res1, &max, dst_ptr); - - src_ptr += src_pitch; - dst_ptr += dst_pitch; - height -= 1; - } while (height > 0); -} - -static INLINE void 
pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { - const __m128i h = _mm_loadu_si128((const __m128i *)filter); - const __m128i p = _mm_set1_epi32(0x09080706); - f[0] = _mm_shuffle_epi8(h, p); -} - -static void pack_8x2_init(const uint16_t *src, __m128i *sig) { - sig[2] = _mm_loadu_si128((const __m128i *)src); -} - -static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, - __m128i *sig) { - // load the next row - const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); - sig[0] = _mm_unpacklo_epi16(sig[2], u); - sig[1] = _mm_unpackhi_epi16(sig[2], u); - sig[2] = u; -} - -static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, - __m128i *y0, __m128i *y1) { - const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); - __m128i x0 = _mm_madd_epi16(sig[0], *f); - __m128i x1 = _mm_madd_epi16(sig[1], *f); - x0 = _mm_add_epi32(x0, rounding); - x1 = _mm_add_epi32(x1, rounding); - *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); - *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); -} - -static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, - const __m128i *mask, uint16_t *dst) { - __m128i res = _mm_packus_epi32(*y0, *y1); - res = _mm_min_epi16(res, *mask); - _mm_storeu_si128((__m128i *)dst, res); -} - -static void aom_highbd_filter_block1d8_v2_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m128i signal[3], res0, res1; - const __m128i max = _mm_set1_epi16((1 << bd) - 1); - __m128i ff; - - pack_8x1_2t_filter(filter, &ff); - pack_8x2_init(src_ptr, signal); - - do { - pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); - filter_8_2t_pixels(signal, &ff, &res0, &res1); - store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); - - src_ptr += src_pitch; - dst_ptr += dst_pitch; - height -= 1; - } while (height > 0); -} - -void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, 
uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2 -#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2 -#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2 -#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2 - -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -#undef HIGHBD_FUNC diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c deleted file mode 100644 index e7b33d1c4..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <tmmintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve_sse2.h" - -void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride; - (void)filter_params_x; - (void)subpel_x_q4; - (void)conv_params; - - assert(conv_params->round_0 <= FILTER_BITS); - assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || - ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - - __m128i s[16], coeffs_y[4]; - - const int bits = FILTER_BITS; - - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); - const __m128i clip_pixel = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); - const __m128i zero = _mm_setzero_si128(); - - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - for (j = 0; j < w; j += 8) { - const uint16_t *data = &src_ptr[j]; - /* Vertical filter */ - { - __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); - __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); - __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); - __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); - __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); - __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); - __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); - - s[0] = _mm_unpacklo_epi16(s0, s1); - s[1] = _mm_unpacklo_epi16(s2, s3); - s[2] = _mm_unpacklo_epi16(s4, s5); - - s[4] = _mm_unpackhi_epi16(s0, s1); - s[5] = _mm_unpackhi_epi16(s2, s3); - s[6] = _mm_unpackhi_epi16(s4, s5); - - s[0 + 8] = _mm_unpacklo_epi16(s1, s2); - s[1 + 8] = _mm_unpacklo_epi16(s3, s4); - s[2 + 8] = _mm_unpacklo_epi16(s5, s6); - - s[4 + 8] = _mm_unpackhi_epi16(s1, s2); - s[5 + 8] = _mm_unpackhi_epi16(s3, s4); - s[6 + 8] = _mm_unpackhi_epi16(s5, s6); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - - __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); - __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); - - s[3] = _mm_unpacklo_epi16(s6, s7); - s[7] = _mm_unpackhi_epi16(s6, s7); - - s[3 + 8] = _mm_unpacklo_epi16(s7, s8); - s[7 + 8] = _mm_unpackhi_epi16(s7, s8); - - const __m128i res_a0 = convolve(s, coeffs_y); - __m128i res_a_round0 = _mm_sra_epi32( - _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); - - const __m128i res_a1 = convolve(s + 8, coeffs_y); - __m128i res_a_round1 = _mm_sra_epi32( - _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); - - if (w - j > 4) { - const __m128i res_b0 = convolve(s + 4, coeffs_y); - __m128i res_b_round0 = _mm_sra_epi32( - _mm_add_epi32(res_b0, 
round_const_bits), round_shift_bits); - - const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); - __m128i res_b_round1 = _mm_sra_epi32( - _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); - - __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); - res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); - res_16bit0 = _mm_max_epi16(res_16bit0, zero); - - __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); - res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); - res_16bit1 = _mm_max_epi16(res_16bit1, zero); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_16bit1); - } else if (w == 4) { - res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); - res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); - res_a_round0 = _mm_max_epi16(res_a_round0, zero); - - res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); - res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); - res_a_round1 = _mm_max_epi16(res_a_round1, zero); - - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_a_round1); - } else { - res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); - res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); - res_a_round0 = _mm_max_epi16(res_a_round0, zero); - - res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); - res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); - res_a_round1 = _mm_max_epi16(res_a_round1, zero); - - *((uint32_t *)(&dst[i * dst_stride + j])) = - _mm_cvtsi128_si32(res_a_round0); - - *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = - _mm_cvtsi128_si32(res_a_round1); - } - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - - s[0 + 8] = s[1 + 8]; - s[1 + 8] = s[2 + 8]; - s[2 + 8] = s[3 + 8]; - - s[4 + 8] = s[5 + 8]; - s[5 + 8] = s[6 + 8]; - s[6 + 8] = s[7 + 
8]; - - s6 = s8; - } - } - } -} - -void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_horiz; - (void)subpel_y_q4; - (void)filter_params_y; - - // Check that, even with 12-bit input, the intermediate values will fit - // into an unsigned 16-bit intermediate array. - assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); - - __m128i s[4], coeffs_x[4]; - - const __m128i round_const_x = - _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); - const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); - - const int bits = FILTER_BITS - conv_params->round_0; - - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); - const __m128i clip_pixel = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); - const __m128i zero = _mm_setzero_si128(); - - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { - for (i = 0; i < h; i += 1) { - const __m128i row00 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i row01 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); - - // even pixels - s[0] = _mm_alignr_epi8(row01, row00, 0); - s[1] = _mm_alignr_epi8(row01, row00, 4); - s[2] = _mm_alignr_epi8(row01, row00, 8); - s[3] = _mm_alignr_epi8(row01, row00, 12); - - __m128i res_even = convolve(s, coeffs_x); - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), - round_shift_x); - - // odd pixels - s[0] = _mm_alignr_epi8(row01, row00, 2); - s[1] = _mm_alignr_epi8(row01, row00, 6); - s[2] = _mm_alignr_epi8(row01, row00, 10); - s[3] = _mm_alignr_epi8(row01, row00, 14); - - __m128i res_odd = convolve(s, coeffs_x); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); - - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), - round_shift_bits); - res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), - round_shift_bits); - - __m128i res_even1 = _mm_packs_epi32(res_even, res_even); - __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); - __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); - - res = _mm_min_epi16(res, clip_pixel); - res = _mm_max_epi16(res, zero); - - if (w - j > 4) { - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); - } else if (w == 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); - } else { - *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); - } - } - } - } -} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c deleted file mode 100644 index 5a55736c4..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c +++ /dev/null @@ -1,984 +0,0 @@ -/* - * 
Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -// ----------------------------------------------------------------------------- -// H_PRED - -void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); - const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); - const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); - const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); - const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); - (void)above; - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); - dst += stride << 2; - left += 4; - aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); -} - -void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i left_u16 = _mm_load_si128((const __m128i *)left); - const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); - const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); - const 
__m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); - const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); - (void)above; - (void)bd; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); -} - -void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i left_u16 = _mm_load_si128((const __m128i *)left); - const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); - const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); - const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); - const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); - const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); - const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); - const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); - const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); - (void)above; - (void)bd; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); - dst += stride; - _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); -} - -void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - aom_highbd_h_predictor_8x8_sse2(dst, 
stride, above, left, bd); - dst += stride << 3; - left += 8; - aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); -} - -static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, - const __m128i *row) { - const __m128i val = _mm_unpacklo_epi64(*row, *row); - _mm_store_si128((__m128i *)*dst, val); - _mm_store_si128((__m128i *)(*dst + 8), val); - *dst += stride; -} - -static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, - const __m128i *row) { - const __m128i val = _mm_unpackhi_epi64(*row, *row); - _mm_store_si128((__m128i *)(*dst), val); - _mm_store_si128((__m128i *)(*dst + 8), val); - *dst += stride; -} - -static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, - const uint16_t *left) { - const __m128i left_u16 = _mm_load_si128((const __m128i *)left); - const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); - const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); - const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); - const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); - const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); - const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); - const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); - const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); - h_store_16_unpacklo(&dst, stride, &row0); - h_store_16_unpacklo(&dst, stride, &row1); - h_store_16_unpacklo(&dst, stride, &row2); - h_store_16_unpacklo(&dst, stride, &row3); - h_store_16_unpackhi(&dst, stride, &row4); - h_store_16_unpackhi(&dst, stride, &row5); - h_store_16_unpackhi(&dst, stride, &row6); - h_store_16_unpackhi(&dst, stride, &row7); -} - -void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)above; - (void)bd; - h_predictor_16x8(dst, stride, left); -} - -void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) 
{ - int i; - (void)above; - (void)bd; - - for (i = 0; i < 2; i++, left += 8) { - h_predictor_16x8(dst, stride, left); - dst += stride << 3; - } -} - -void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - int i; - (void)above; - (void)bd; - - for (i = 0; i < 4; i++, left += 8) { - h_predictor_16x8(dst, stride, left); - dst += stride << 3; - } -} - -static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, - const __m128i *row) { - const __m128i val = _mm_unpacklo_epi64(*row, *row); - _mm_store_si128((__m128i *)(*dst), val); - _mm_store_si128((__m128i *)(*dst + 8), val); - _mm_store_si128((__m128i *)(*dst + 16), val); - _mm_store_si128((__m128i *)(*dst + 24), val); - *dst += stride; -} - -static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, - const __m128i *row) { - const __m128i val = _mm_unpackhi_epi64(*row, *row); - _mm_store_si128((__m128i *)(*dst), val); - _mm_store_si128((__m128i *)(*dst + 8), val); - _mm_store_si128((__m128i *)(*dst + 16), val); - _mm_store_si128((__m128i *)(*dst + 24), val); - *dst += stride; -} - -static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, - const uint16_t *left) { - const __m128i left_u16 = _mm_load_si128((const __m128i *)left); - const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); - const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); - const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); - const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); - const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); - const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); - const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); - const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); - h_store_32_unpacklo(&dst, stride, &row0); - h_store_32_unpacklo(&dst, stride, &row1); - h_store_32_unpacklo(&dst, stride, &row2); - h_store_32_unpacklo(&dst, stride, &row3); - 
h_store_32_unpackhi(&dst, stride, &row4); - h_store_32_unpackhi(&dst, stride, &row5); - h_store_32_unpackhi(&dst, stride, &row6); - h_store_32_unpackhi(&dst, stride, &row7); -} - -void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - int i; - (void)above; - (void)bd; - - for (i = 0; i < 2; i++, left += 8) { - h_predictor_32x8(dst, stride, left); - dst += stride << 3; - } -} - -void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - int i; - (void)above; - (void)bd; - - for (i = 0; i < 4; i++, left += 8) { - h_predictor_32x8(dst, stride, left); - dst += stride << 3; - } -} - -// ----------------------------------------------------------------------------- -// DC_TOP, DC_LEFT, DC_128 - -// 4x4 - -static INLINE __m128i dc_sum_4(const uint16_t *ref) { - const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); - const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); - const __m128i a = _mm_add_epi16(_dcba, _xxdc); - return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); -} - -static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, - const __m128i *dc) { - const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); - int i; - for (i = 0; i < 4; ++i, dst += stride) { - _mm_storel_epi64((__m128i *)dst, dc_dup); - } -} - -void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i two = _mm_cvtsi32_si128(2); - const __m128i sum = dc_sum_4(left); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); - (void)above; - (void)bd; - dc_store_4x4(dst, stride, &dc); -} - -void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i two = _mm_cvtsi32_si128(2); - const __m128i sum = dc_sum_4(above); - const __m128i dc = 
_mm_srli_epi16(_mm_add_epi16(sum, two), 2); - (void)left; - (void)bd; - dc_store_4x4(dst, stride, &dc); -} - -void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - (void)above; - (void)left; - dc_store_4x4(dst, stride, &dc_dup); -} - -// ----------------------------------------------------------------------------- -// 4x8 - -static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, - const __m128i *dc) { - const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); - int i; - for (i = 0; i < 8; ++i, dst += stride) { - _mm_storel_epi64((__m128i *)dst, dc_dup); - } -} - -// Shared with DC 8xh -static INLINE __m128i dc_sum_8(const uint16_t *ref) { - const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); - const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); - const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); - const __m128i a = _mm_add_epi16(_dcba, _xxdc); - - return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); -} - -void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i sum = dc_sum_8(left); - const __m128i four = _mm_cvtsi32_si128(4); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); - (void)above; - (void)bd; - dc_store_4x8(dst, stride, &dc); -} - -void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i two = _mm_cvtsi32_si128(2); - const __m128i sum = dc_sum_4(above); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); - (void)left; - (void)bd; - dc_store_4x8(dst, stride, &dc); -} - -void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const 
__m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - (void)above; - (void)left; - dc_store_4x8(dst, stride, &dc_dup); -} - -// ----------------------------------------------------------------------------- -// 8xh - -static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, - const __m128i *dc) { - const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); - const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); - int i; - for (i = 0; i < height; ++i, dst += stride) { - _mm_store_si128((__m128i *)dst, dc_dup); - } -} - -// ----------------------------------------------------------------------------- -// DC_TOP - -static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, - int height, const uint16_t *above) { - const __m128i four = _mm_cvtsi32_si128(4); - const __m128i sum = dc_sum_8(above); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); - dc_store_8xh(dst, stride, height, &dc); -} - -void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - dc_top_predictor_8xh(dst, stride, 4, above); -} - -void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - dc_top_predictor_8xh(dst, stride, 8, above); -} - -void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - dc_top_predictor_8xh(dst, stride, 16, above); -} - -// ----------------------------------------------------------------------------- -// DC_LEFT - -void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i two = _mm_cvtsi32_si128(2); - const __m128i sum = dc_sum_4(left); - const __m128i dc = 
_mm_srli_epi16(_mm_add_epi16(sum, two), 2); - (void)above; - (void)bd; - dc_store_8xh(dst, stride, 4, &dc); -} - -void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i four = _mm_cvtsi32_si128(4); - const __m128i sum = dc_sum_8(left); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); - (void)above; - (void)bd; - dc_store_8xh(dst, stride, 8, &dc); -} - -// Shared with DC 16xh -static INLINE __m128i dc_sum_16(const uint16_t *ref) { - const __m128i sum_lo = dc_sum_8(ref); - const __m128i sum_hi = dc_sum_8(ref + 8); - return _mm_add_epi16(sum_lo, sum_hi); -} - -void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i eight = _mm_cvtsi32_si128(8); - const __m128i sum = dc_sum_16(left); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); - (void)above; - (void)bd; - dc_store_8xh(dst, stride, 16, &dc); -} - -// ----------------------------------------------------------------------------- -// DC_128 - -static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, - int height, int bd) { - const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - dc_store_8xh(dst, stride, height, &dc_dup); -} - -void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)above; - (void)left; - dc_128_predictor_8xh(dst, stride, 4, bd); -} - -void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)above; - (void)left; - dc_128_predictor_8xh(dst, stride, 8, bd); -} - -void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)above; - (void)left; - 
dc_128_predictor_8xh(dst, stride, 16, bd); -} - -// ----------------------------------------------------------------------------- -// 16xh - -static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, - const __m128i *dc) { - const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); - const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); - int i; - for (i = 0; i < height; ++i, dst += stride) { - _mm_store_si128((__m128i *)dst, dc_dup); - _mm_store_si128((__m128i *)(dst + 8), dc_dup); - } -} - -// ----------------------------------------------------------------------------- -// DC_LEFT - -void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i four = _mm_cvtsi32_si128(4); - const __m128i sum = dc_sum_8(left); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); - (void)above; - (void)bd; - dc_store_16xh(dst, stride, 8, &dc); -} - -void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i eight = _mm_cvtsi32_si128(8); - const __m128i sum = dc_sum_16(left); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); - (void)above; - (void)bd; - dc_store_16xh(dst, stride, 16, &dc); -} - -// Shared with 32xh -static INLINE __m128i dc_sum_32(const uint16_t *ref) { - const __m128i zero = _mm_setzero_si128(); - const __m128i sum_a = dc_sum_16(ref); - const __m128i sum_b = dc_sum_16(ref + 16); - // 12 bit bd will outrange, so expand to 32 bit before adding final total - return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), - _mm_unpacklo_epi16(sum_b, zero)); -} - -void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i sixteen = _mm_cvtsi32_si128(16); - const __m128i sum = dc_sum_32(left); - const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, 
sixteen), 5); - (void)above; - (void)bd; - dc_store_16xh(dst, stride, 32, &dc); -} - -// ----------------------------------------------------------------------------- -// DC_TOP - -void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i eight = _mm_cvtsi32_si128(8); - const __m128i sum = dc_sum_16(above); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); - (void)left; - (void)bd; - dc_store_16xh(dst, stride, 8, &dc); -} - -void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i eight = _mm_cvtsi32_si128(8); - const __m128i sum = dc_sum_16(above); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); - (void)left; - (void)bd; - dc_store_16xh(dst, stride, 16, &dc); -} - -void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i eight = _mm_cvtsi32_si128(8); - const __m128i sum = dc_sum_16(above); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); - (void)left; - (void)bd; - dc_store_16xh(dst, stride, 32, &dc); -} - -// ----------------------------------------------------------------------------- -// DC_128 - -void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - (void)above; - (void)left; - dc_store_16xh(dst, stride, 8, &dc_dup); -} - -void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - (void)above; - (void)left; - dc_store_16xh(dst, stride, 16, &dc_dup); -} - 
-void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - (void)above; - (void)left; - dc_store_16xh(dst, stride, 32, &dc_dup); -} - -// ----------------------------------------------------------------------------- -// 32xh - -static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, - const __m128i *dc) { - const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); - const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); - int i; - for (i = 0; i < height; ++i, dst += stride) { - _mm_store_si128((__m128i *)dst, dc_dup); - _mm_store_si128((__m128i *)(dst + 8), dc_dup); - _mm_store_si128((__m128i *)(dst + 16), dc_dup); - _mm_store_si128((__m128i *)(dst + 24), dc_dup); - } -} - -void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i eight = _mm_cvtsi32_si128(8); - const __m128i sum = dc_sum_16(left); - const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); - (void)above; - (void)bd; - dc_store_32xh(dst, stride, 16, &dc); -} - -void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i sixteen = _mm_cvtsi32_si128(16); - const __m128i sum = dc_sum_32(left); - const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); - (void)above; - (void)bd; - dc_store_32xh(dst, stride, 32, &dc); -} - -void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i sixteen = _mm_cvtsi32_si128(16); - const __m128i sum = dc_sum_32(above); - const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); - (void)left; - (void)bd; - dc_store_32xh(dst, stride, 16, &dc); -} - -void 
aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - (void)above; - (void)left; - dc_store_32xh(dst, stride, 16, &dc_dup); -} - -void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i sixteen = _mm_cvtsi32_si128(16); - const __m128i sum = dc_sum_32(above); - const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); - (void)left; - (void)bd; - dc_store_32xh(dst, stride, 32, &dc); -} - -void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); - const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); - (void)above; - (void)left; - dc_store_32xh(dst, stride, 32, &dc_dup); -} - -// ----------------------------------------------------------------------------- -// V_PRED - -void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); - int i; - for (i = 0; i < 2; ++i) { - _mm_storel_epi64((__m128i *)dst, above_u16); - _mm_storel_epi64((__m128i *)(dst + stride), above_u16); - _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); - _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); - dst += stride << 2; - } -} - -void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - const __m128i above_u16 = _mm_load_si128((const __m128i *)above); - _mm_store_si128((__m128i *)dst, above_u16); - _mm_store_si128((__m128i *)(dst + stride), above_u16); - _mm_store_si128((__m128i *)(dst + 2 * stride), 
above_u16); - _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); -} - -void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - const __m128i above_u16 = _mm_load_si128((const __m128i *)above); - int i; - for (i = 0; i < 4; ++i) { - _mm_store_si128((__m128i *)dst, above_u16); - _mm_store_si128((__m128i *)(dst + stride), above_u16); - _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); - _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); - dst += stride << 2; - } -} - -void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); - const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); - int i; - for (i = 0; i < 2; ++i) { - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - } -} - -void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); - const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); - int i; - for (i = 0; i < 8; ++i) { - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - 
_mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - dst += stride; - } -} - -void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); - const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); - int i; - for (i = 0; i < 4; ++i) { - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - _mm_store_si128((__m128i *)(dst + 16), above2_u16); - _mm_store_si128((__m128i *)(dst + 24), above3_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - _mm_store_si128((__m128i *)(dst + 16), above2_u16); - _mm_store_si128((__m128i *)(dst + 24), above3_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - _mm_store_si128((__m128i *)(dst + 16), above2_u16); - _mm_store_si128((__m128i *)(dst + 24), above3_u16); - dst += stride; - _mm_store_si128((__m128i *)dst, above0_u16); - _mm_store_si128((__m128i *)(dst + 8), above1_u16); - _mm_store_si128((__m128i *)(dst + 16), above2_u16); - _mm_store_si128((__m128i *)(dst + 24), above3_u16); - dst += stride; - } -} - -// ----------------------------------------------------------------------------- -// DC_PRED - -void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - const __m128i sum_above = dc_sum_4(above); - const __m128i sum_left = dc_sum_8(left); - const __m128i sum = 
_mm_add_epi16(sum_above, sum_left); - uint32_t sum32 = _mm_cvtsi128_si32(sum); - sum32 >>= 16; - sum32 += 6; - sum32 /= 12; - const __m128i row = _mm_set1_epi16((uint16_t)sum32); - int i; - for (i = 0; i < 4; ++i) { - _mm_storel_epi64((__m128i *)dst, row); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row); - dst += stride; - } -} - -void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - const __m128i sum_left = dc_sum_4(left); - const __m128i sum_above = dc_sum_8(above); - const __m128i sum = _mm_add_epi16(sum_above, sum_left); - uint32_t sum32 = _mm_cvtsi128_si32(sum); - sum32 >>= 16; - sum32 += 6; - sum32 /= 12; - const __m128i row = _mm_set1_epi16((uint16_t)sum32); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); -} - -void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - __m128i sum_left = dc_sum_16(left); - __m128i sum_above = dc_sum_8(above); - const __m128i zero = _mm_setzero_si128(); - sum_left = _mm_unpacklo_epi16(sum_left, zero); - sum_above = _mm_unpacklo_epi16(sum_above, zero); - const __m128i sum = _mm_add_epi32(sum_left, sum_above); - uint32_t sum32 = _mm_cvtsi128_si32(sum); - sum32 += 12; - sum32 /= 24; - const __m128i row = _mm_set1_epi16((uint16_t)sum32); - int i; - for (i = 0; i < 4; ++i) { - _mm_store_si128((__m128i *)dst, row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - dst += stride; - } -} - -void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - __m128i sum_left = dc_sum_8(left); - 
__m128i sum_above = dc_sum_16(above); - const __m128i zero = _mm_setzero_si128(); - sum_left = _mm_unpacklo_epi16(sum_left, zero); - sum_above = _mm_unpacklo_epi16(sum_above, zero); - const __m128i sum = _mm_add_epi32(sum_left, sum_above); - uint32_t sum32 = _mm_cvtsi128_si32(sum); - sum32 += 12; - sum32 /= 24; - const __m128i row = _mm_set1_epi16((uint16_t)sum32); - int i; - for (i = 0; i < 2; ++i) { - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - } -} - -void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - __m128i sum_left = dc_sum_32(left); - __m128i sum_above = dc_sum_16(above); - const __m128i zero = _mm_setzero_si128(); - sum_above = _mm_unpacklo_epi16(sum_above, zero); - const __m128i sum = _mm_add_epi32(sum_left, sum_above); - uint32_t sum32 = _mm_cvtsi128_si32(sum); - sum32 += 24; - sum32 /= 48; - const __m128i row = _mm_set1_epi16((uint16_t)sum32); - int i; - for (i = 0; i < 8; ++i) { - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - dst += stride; - } -} - -void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)bd; - __m128i sum_left = dc_sum_16(left); - __m128i sum_above = 
dc_sum_32(above); - const __m128i zero = _mm_setzero_si128(); - sum_left = _mm_unpacklo_epi16(sum_left, zero); - const __m128i sum = _mm_add_epi32(sum_left, sum_above); - uint32_t sum32 = _mm_cvtsi128_si32(sum); - sum32 += 24; - sum32 /= 48; - const __m128i row = _mm_set1_epi16((uint16_t)sum32); - int i; - for (i = 0; i < 4; ++i) { - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - _mm_store_si128((__m128i *)(dst + 16), row); - _mm_store_si128((__m128i *)(dst + 24), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - _mm_store_si128((__m128i *)(dst + 16), row); - _mm_store_si128((__m128i *)(dst + 24), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - _mm_store_si128((__m128i *)(dst + 16), row); - _mm_store_si128((__m128i *)(dst + 24), row); - dst += stride; - _mm_store_si128((__m128i *)dst, row); - _mm_store_si128((__m128i *)(dst + 8), row); - _mm_store_si128((__m128i *)(dst + 16), row); - _mm_store_si128((__m128i *)(dst + 24), row); - dst += stride; - } -} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm deleted file mode 100644 index 91b3d126c..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm +++ /dev/null @@ -1,259 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 4 dd 16 -pw_32: times 4 dd 32 - -SECTION .text -INIT_XMM sse2 -cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - movq m2, [leftq] - paddw m0, m2 - pshuflw m1, m0, 0xe - paddw m0, m1 - pshuflw m1, m0, 0x1 - paddw m0, m1 - paddw m0, [GLOBAL(pw_4)] - psraw m0, 3 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, one - mov oned, 0x00010001 - lea stride3q, [strideq*3] - movd m3, oned - pshufd m3, m3, 0x0 - paddw m0, m2 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 - paddw m0, [GLOBAL(pw_8)] - psrlw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - lea dstq, [dstq+strideq*8] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m3, [aboveq+16] - mova m2, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - punpcklwd m0, m1 - movhlps m2, m0 - paddd m0, m2 - punpckldq m0, m1 - movhlps m2, m0 - paddd m0, m2 - paddd m0, [GLOBAL(pw_16)] - psrad m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova 
[dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m0 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2+16], m0 - lea dstq, [dstq+strideq*8] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [aboveq+32] - mova m4, [aboveq+48] - paddw m0, m2 - paddw m3, m4 - mova m2, [leftq] - mova m4, [leftq+16] - mova m5, [leftq+32] - mova m6, [leftq+48] - paddw m2, m4 - paddw m5, m6 - paddw m0, m3 - paddw m2, m5 - pxor m1, m1 - paddw m0, m2 - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - movhlps m2, m0 - paddw m0, m2 - punpcklwd m0, m1 - movhlps m2, m0 - paddd m0, m2 - punpckldq m0, m1 - movhlps m2, m0 - paddd m0, m2 - paddd m0, [GLOBAL(pw_32)] - psrad m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16 ], m0 - mova [dstq +32 ], m0 - mova [dstq +48 ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16 ], m0 - mova [dstq+strideq*2+32 ], m0 - mova [dstq+strideq*2+48 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4+16 ], m0 - mova [dstq+strideq*4+32 ], m0 - mova [dstq+strideq*4+48 ], m0 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2 +16], m0 - mova [dstq+stride3q*2 +32], m0 - mova [dstq+stride3q*2 +48], m0 - lea dstq, [dstq+strideq*8] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - 
mova [dstq+stride3q*2], m0 - lea dstq, [dstq+strideq*8] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m1 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m1 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2+16], m1 - lea dstq, [dstq+strideq*8] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - mova m2, [aboveq+32] - mova m3, [aboveq+48] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq +32], m2 - mova [dstq +48], m3 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m1 - mova [dstq+strideq*2 +32], m2 - mova [dstq+strideq*2 +48], m3 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m1 - mova [dstq+strideq*4 +32], m2 - mova [dstq+strideq*4 +48], m3 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2 +16], m1 - mova [dstq+stride3q*2 +32], m2 - mova [dstq+stride3q*2 +48], m3 - lea dstq, [dstq+strideq*8] - dec nlines4d - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c deleted file mode 100644 index c954da94e..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/common_avx2.h" -#include "aom_dsp/x86/lpf_common_sse2.h" -#include "aom/aom_integer.h" - -void aom_highbd_lpf_horizontal_14_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, - blimit1, limit1, thresh1, bd); -} - -void aom_highbd_lpf_vertical_14_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, - limit1, thresh1, bd); -} - -void aom_highbd_lpf_horizontal_4_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, - limit1, thresh1, bd); -} - -void aom_highbd_lpf_horizontal_8_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, - limit1, thresh1, bd); -} - -void aom_highbd_lpf_vertical_4_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t 
*blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, - limit1, thresh1, bd); -} - -void aom_highbd_lpf_vertical_8_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, - limit1, thresh1, bd); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c deleted file mode 100644 index 097e0778f..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ /dev/null @@ -1,1697 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/lpf_common_sse2.h" - -static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max, - __m128i *pixel) { - *pixel = _mm_min_epi16(*pixel, *max); - *pixel = _mm_max_epi16(*pixel, *min); -} - -static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { - return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); -} - -static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, - const uint8_t *t, int bd, __m128i *blt, - __m128i *lt, __m128i *thr, __m128i *t80_out) { - const int shift = bd - 8; - const __m128i zero = _mm_setzero_si128(); - - __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); - *blt = _mm_slli_epi16(x, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); - *lt = _mm_slli_epi16(x, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); - *thr = _mm_slli_epi16(x, shift); - - *t80_out = _mm_set1_epi16(1 << (bd - 1)); -} - -static INLINE void get_limit_dual( - const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, - const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, - int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, - __m128i *t80_out) { - const int shift = bd - 8; - const __m128i zero = _mm_setzero_si128(); - - __m128i x0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero); - __m128i x1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero); - x0 = _mm_unpacklo_epi64(x0, x1); - *blt_out = _mm_slli_epi16(x0, shift); - - x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero); - x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero); - x0 = _mm_unpacklo_epi64(x0, x1); - *lt_out = _mm_slli_epi16(x0, shift); - - x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero); - x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i 
*)_thresh1), zero); - x0 = _mm_unpacklo_epi64(x0, x1); - *thr_out = _mm_slli_epi16(x0, shift); - - *t80_out = _mm_set1_epi16(1 << (bd - 1)); -} - -static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, - __m128i *p, __m128i *q) { - int i; - for (i = 0; i < size; i++) { - p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); - q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); - } -} - -static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, - const __m128i *l, const __m128i *bl, - __m128i *mask) { - __m128i abs_p0q0 = abs_diff16(p[0], q[0]); - __m128i abs_p1q1 = abs_diff16(p[1], q[1]); - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_set1_epi16(0xFFFF); - - __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); - max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); - max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); - - int i; - for (i = 1; i < 4; ++i) { - max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1])); - max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1])); - } - max = _mm_subs_epu16(max, *l); - *mask = _mm_cmpeq_epi16(max, zero); // return ~mask -} - -static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, - __m128i *p1p0, __m128i *q1q0, - __m128i *abs_p1p0, __m128i *l, - __m128i *bl, __m128i *t, - __m128i *hev, __m128i *mask) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_set1_epi16(0xFFFF); - __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; - __m128i max, max01, h; - - *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]); - *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]); - - abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0); - abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1); - abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); - - abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8); 
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2 - - max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); - max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); - // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); - - *abs_p1p0 = abs_diff16(pq[0], pq[1]); - abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8); - max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0); - // mask |= (abs(*p1 - *p0) > limit) * -1; - // mask |= (abs(*q1 - *q0) > limit) * -1; - h = _mm_subs_epu16(max01, *t); - - *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); - // replicate for the further "merged variables" usage - *hev = _mm_unpacklo_epi64(*hev, *hev); - - max = _mm_max_epi16(max, max01); - int i; - for (i = 2; i < x; ++i) { - max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1])); - } - max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); - - max = _mm_subs_epu16(max, *l); - *mask = _mm_cmpeq_epi16(max, zero); // ~mask -} - -static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, - int start, int end, __m128i *flat) { - int i; - __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), - abs_diff16(pq[start + 1], pq[0])); - - for (i = start + 2; i < end; ++i) { - max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0])); - } - max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); - - __m128i ft; - ft = _mm_subs_epu16(max, *th); - - const __m128i zero = _mm_setzero_si128(); - *flat = _mm_cmpeq_epi16(ft, zero); -} - -static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, - const __m128i *q, int start, int end, - __m128i *flat) { - int i; - __m128i max = - _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0])); - - for (i = start + 1; i < end; ++i) { - max = _mm_max_epi16(max, abs_diff16(p[i], p[0])); - max = _mm_max_epi16(max, abs_diff16(q[i], q[0])); - } - - __m128i ft; - ft = _mm_subs_epu16(max, *th); - - const __m128i zero = 
_mm_setzero_si128(); - *flat = _mm_cmpeq_epi16(ft, zero); -} - -static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, - __m128i *flat2, int bd) { - // check the distance 1,2,3 against 0 - __m128i th = _mm_set1_epi16(1); - th = _mm_slli_epi16(th, bd - 8); - flat_mask_internal(&th, pq, 1, 4, flat); - flat_mask_internal(&th, pq, 4, 7, flat2); -} - -static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p, - const __m128i *q, __m128i *flat, - __m128i *flat2, int bd) { - // check the distance 1,2,3 against 0 - __m128i th = _mm_set1_epi16(1); - th = _mm_slli_epi16(th, bd - 8); - flat_mask_internal_dual(&th, p, q, 1, 4, flat); - flat_mask_internal_dual(&th, p, q, 4, 7, flat2); -} - -static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, - __m128i *hev, __m128i *mask, - __m128i *qs1qs0, - __m128i *ps1ps0, __m128i *t80, - int bd) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); - const __m128i pmin = _mm_subs_epi16(zero, *t80); - - const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4); - __m128i ps1ps0_work, qs1qs0_work, work; - __m128i filt, filter2filter1, filter2filt, filter1filt; - - ps1ps0_work = _mm_subs_epi16(*p1p0, *t80); - qs1qs0_work = _mm_subs_epi16(*q1q0, *t80); - - work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work); - pixel_clamp(&pmin, &pmax, &work); - filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev); - - filt = _mm_subs_epi16(filt, work); - filt = _mm_subs_epi16(filt, work); - filt = _mm_subs_epi16(filt, work); - // (aom_filter + 3 * (qs0 - ps0)) & mask - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm_and_si128(filt, *mask); - filt = _mm_unpacklo_epi64(filt, filt); - - filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */ - pixel_clamp(&pmin, &pmax, &filter2filter1); - filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */ - - filt = 
_mm_unpacklo_epi64(filter2filter1, filter2filter1); - - // filt >> 1 - filt = _mm_adds_epi16(filt, one); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(*hev, filt); - - filter2filt = _mm_unpackhi_epi64(filter2filter1, filt); - filter1filt = _mm_unpacklo_epi64(filter2filter1, filt); - - qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt); - ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt); - - pixel_clamp(&pmin, &pmax, &qs1qs0_work); - pixel_clamp(&pmin, &pmax, &ps1ps0_work); - - *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80); - *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); -} - -static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, - __m128i *qs, const __m128i *mask, - const __m128i *th, int bd, - __m128i *t80) { - __m128i ps0 = _mm_subs_epi16(p[0], *t80); - __m128i ps1 = _mm_subs_epi16(p[1], *t80); - __m128i qs0 = _mm_subs_epi16(q[0], *t80); - __m128i qs1 = _mm_subs_epi16(q[1], *t80); - const __m128i one = _mm_set1_epi16(1); - const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); - - const __m128i zero = _mm_setzero_si128(); - const __m128i pmin = _mm_subs_epi16(zero, *t80); - __m128i filter = _mm_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filter); - - // hev_filter - __m128i hev; - const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); - const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); - __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); - h = _mm_subs_epu16(h, *th); - const __m128i ffff = _mm_cmpeq_epi16(h, h); - hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); - - filter = _mm_and_si128(filter, hev); - - const __m128i x = _mm_subs_epi16(qs0, ps0); - filter = _mm_adds_epi16(filter, x); - filter = _mm_adds_epi16(filter, x); - filter = _mm_adds_epi16(filter, x); - pixel_clamp(&pmin, &pmax, &filter); - filter = _mm_and_si128(filter, *mask); - const __m128i t3 = _mm_set1_epi16(3); - const __m128i t4 = _mm_set1_epi16(4); - __m128i filter1 = _mm_adds_epi16(filter, t4); - __m128i 
filter2 = _mm_adds_epi16(filter, t3); - pixel_clamp(&pmin, &pmax, &filter1); - pixel_clamp(&pmin, &pmax, &filter2); - filter1 = _mm_srai_epi16(filter1, 3); - filter2 = _mm_srai_epi16(filter2, 3); - qs0 = _mm_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &qs0); - ps0 = _mm_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &ps0); - qs[0] = _mm_adds_epi16(qs0, *t80); - ps[0] = _mm_adds_epi16(ps0, *t80); - filter = _mm_adds_epi16(filter1, one); - filter = _mm_srai_epi16(filter, 1); - filter = _mm_andnot_si128(hev, filter); - qs1 = _mm_subs_epi16(qs1, filter); - pixel_clamp(&pmin, &pmax, &qs1); - ps1 = _mm_adds_epi16(ps1, filter); - pixel_clamp(&pmin, &pmax, &ps1); - qs[1] = _mm_adds_epi16(qs1, *t80); - ps[1] = _mm_adds_epi16(ps1, *t80); -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( - __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, - const unsigned char *lt, const unsigned char *thr, int bd) { - int i; - const __m128i zero = _mm_setzero_si128(); - __m128i blimit, limit, thresh; - __m128i t80; - get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); - - for (i = 0; i < 7; i++) { - pq[i] = _mm_unpacklo_epi64(p[i], q[i]); - } - __m128i mask, hevhev; - __m128i p1p0, q1q0, abs_p1p0; - - highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, - &thresh, &hevhev, &mask); - - __m128i ps0ps1, qs0qs1; - // filter4 - highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); - - __m128i flat, flat2; - highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); - - flat = _mm_and_si128(flat, mask); - flat2 = _mm_and_si128(flat2, flat); - - // replicate for the further "merged variables" usage - flat = _mm_unpacklo_epi64(flat, flat); - flat2 = _mm_unpacklo_epi64(flat2, flat2); - - // flat and wide flat calculations - - // if flat ==0 then flat2 is zero as well and we don't need any calc below - // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) 
{ - __m128i flat_p[3], flat_q[3], flat_pq[3]; - __m128i flat2_p[6], flat2_q[6]; - __m128i flat2_pq[6]; - __m128i sum_p6, sum_p3; - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - - __m128i work0, work0_0, work0_1, sum_p_0; - __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); - __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); - sum_p = _mm_add_epi16(sum_p, sum_lp); - - __m128i sum_lq = _mm_srli_si128(sum_lp, 8); - __m128i sum_q = _mm_srli_si128(sum_p, 8); - - sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); - sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - - flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); - flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); - - sum_p6 = _mm_add_epi16(pq[6], pq[6]); - sum_p3 = _mm_add_epi16(pq[3], pq[3]); - - sum_q = _mm_sub_epi16(sum_p_0, pq[5]); - sum_p = _mm_sub_epi16(sum_p_0, q[5]); - - work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); - work0_1 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - - sum_lq = _mm_sub_epi16(sum_lp, pq[2]); - sum_lp = _mm_sub_epi16(sum_lp, q[2]); - - work0 = _mm_add_epi16(sum_p3, pq[1]); - flat_p[1] = _mm_add_epi16(sum_lp, work0); - flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); - - flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); - flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); - - sum_lp = _mm_sub_epi16(sum_lp, q[1]); - sum_lq = _mm_sub_epi16(sum_lq, pq[1]); - - sum_p3 = _mm_add_epi16(sum_p3, pq[3]); - work0 = _mm_add_epi16(sum_p3, pq[2]); - - flat_p[2] = _mm_add_epi16(sum_lp, work0); - flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); - flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); - - int flat2_mask = - (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); - if (flat2_mask) { - flat2_p[0] = _mm_add_epi16(sum_p_0, 
_mm_add_epi16(work0_0, q[0])); - flat2_q[0] = _mm_add_epi16( - sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); - - flat2_p[1] = _mm_add_epi16(sum_p, work0_1); - flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); - - flat2_pq[0] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); - flat2_pq[1] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); - - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, pq[4]); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); - flat2_p[2] = _mm_add_epi16(sum_p, work0); - flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[2] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, pq[3]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); - flat2_p[3] = _mm_add_epi16(sum_p, work0); - flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[3] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, pq[2]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); - flat2_p[4] = _mm_add_epi16(sum_p, work0); - flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[4] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, pq[1]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); - flat2_p[5] = _mm_add_epi16(sum_p, work0); - flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[5] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); - } // flat2 - 
// ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // highbd_filter8 - pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); - pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); - - for (i = 0; i < 3; i++) { - pq[i] = _mm_andnot_si128(flat, pq[i]); - flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); - pq[i] = _mm_or_si128(pq[i], flat_pq[i]); - } - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - if (flat2_mask) { - for (i = 0; i < 6; i++) { - pq[i] = _mm_andnot_si128(flat2, pq[i]); - flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); - pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values - } - } - } else { - pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); - pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); - } -} - -void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, - const uint8_t *blt, const uint8_t *lt, - const uint8_t *thr, int bd) { - __m128i p[7], q[7], pq[7]; - int i; - - for (i = 0; i < 7; i++) { - p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); - q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); - } - - highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd); - - for (i = 0; i < 6; i++) { - _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); - _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); - } -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( - __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, - const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, - const uint8_t *thr1, int bd) { - __m128i blimit, limit, thresh, t80; - const __m128i zero = _mm_setzero_si128(); - - get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, - &t80); - __m128i mask; - highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); - __m128i flat, flat2; - highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); - - flat = _mm_and_si128(flat, mask); - flat2 = _mm_and_si128(flat2, flat); - __m128i ps[2], qs[2]; - highbd_filter4_dual_sse2(p, q, ps, qs, 
&mask, &thresh, bd, &t80); - // flat and wide flat calculations - - // if flat ==0 then flat2 is zero as well and we don't need any calc below - // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[6], flat2_q[6]; - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); - __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); - __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); - sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); - __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); - sum_q = _mm_add_epi16(sum_q, sum_lq); - sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); - sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - flat_p[0] = - _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); - flat_q[0] = - _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); - __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); - __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); - __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); - __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - - sum_q = _mm_sub_epi16(sum_p_0, p[5]); - __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); - - sum_lq = _mm_sub_epi16(sum_lp, p[2]); - sum_lp = _mm_sub_epi16(sum_lp, q[2]); - flat_p[1] = - _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); - flat_q[1] = - _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - - sum_lp = _mm_sub_epi16(sum_lp, q[1]); - sum_lq = _mm_sub_epi16(sum_lq, p[1]); - sum_p3 = _mm_add_epi16(sum_p3, p[3]); - sum_q3 = _mm_add_epi16(sum_q3, q[3]); - flat_p[2] = - _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); - flat_q[2] = - _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - - int flat2_mask = - (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, 
zero))); - if (flat2_mask) { - flat2_p[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), - _mm_add_epi16(p[1], q[0]))), - 4); - flat2_q[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), - _mm_add_epi16(p[0], q[1]))), - 4); - - flat2_p[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), - 4); - flat2_q[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - flat2_p[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), - 4); - flat2_q[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - flat2_p[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), - 4); - flat2_q[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - flat2_p[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), - 4); - flat2_q[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - 
sum_q = _mm_sub_epi16(sum_q, p[1]); - flat2_p[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), - 4); - flat2_q[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), - 4); - } - // highbd_filter8 - int i; - for (i = 0; i < 2; i++) { - ps[i] = _mm_andnot_si128(flat, ps[i]); - flat_p[i] = _mm_and_si128(flat, flat_p[i]); - p[i] = _mm_or_si128(ps[i], flat_p[i]); - qs[i] = _mm_andnot_si128(flat, qs[i]); - flat_q[i] = _mm_and_si128(flat, flat_q[i]); - q[i] = _mm_or_si128(qs[i], flat_q[i]); - } - p[2] = _mm_andnot_si128(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm_and_si128(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm_andnot_si128(flat, q[2]); - flat_q[2] = _mm_and_si128(flat, flat_q[2]); - q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - - for (i = 0; i < 2; i++) { - ps[i] = _mm_andnot_si128(flat, ps[i]); - flat_p[i] = _mm_and_si128(flat, flat_p[i]); - p[i] = _mm_or_si128(ps[i], flat_p[i]); - qs[i] = _mm_andnot_si128(flat, qs[i]); - flat_q[i] = _mm_and_si128(flat, flat_q[i]); - q[i] = _mm_or_si128(qs[i], flat_q[i]); - } - // highbd_filter16 - if (flat2_mask) { - for (i = 0; i < 6; i++) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); - } - } - } else { - p[0] = ps[0]; - q[0] = qs[0]; - p[1] = ps[1]; - q[1] = qs[1]; - } -} - -void aom_highbd_lpf_horizontal_14_dual_sse2( - uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, 
const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m128i p[7], q[7]; - int i; - load_highbd_pixel(s, 7, pitch, p, q); - - highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, - _limit1, _thresh1, bd); - - for (i = 0; i < 6; i++) { - _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_store_si128((__m128i *)(s + i * pitch), q[i]); - } -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( - __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, - __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, - const uint8_t *_limit, const uint8_t *_thresh, int bd) { - __m128i blimit, limit, thresh; - __m128i mask, hev, flat; - __m128i pq[3]; - __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0; - __m128i flat_p1p0, flat_q0q1; - - pq[0] = _mm_unpacklo_epi64(*p0, *q0); - pq[1] = _mm_unpacklo_epi64(*p1, *q1); - pq[2] = _mm_unpacklo_epi64(*p2, *q2); - - const __m128i zero = _mm_setzero_si128(); - const __m128i four = _mm_set1_epi16(4); - __m128i t80; - const __m128i one = _mm_set1_epi16(0x1); - - get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); - - highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, - &thresh, &hev, &mask); - - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); - - // flat_mask - flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); - flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); - - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - - flat = _mm_cmpeq_epi16(flat, zero); - flat = _mm_and_si128(flat, mask); - // replicate for the further "merged variables" usage - flat = _mm_unpacklo_epi64(flat, flat); - - // 5 tap filter - // need it only if flat !=0 - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { - __m128i workp_a, workp_b, workp_c; - __m128i pq0x2_pq1, pq1_pq2; - - // op1 - pq0x2_pq1 = - _mm_add_epi16(_mm_add_epi16(pq[0], 
pq[0]), pq[1]); // p0 *2 + p1 - pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 - workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), - pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 - - workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); - workp_b = - _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 - - // op0 - workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 - workp_a = _mm_add_epi16(workp_a, - workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 - workp_b = _mm_unpacklo_epi64(workp_a, workp_b); - flat_p1p0 = _mm_srli_epi16(workp_b, 3); - - // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), - pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 - workp_b = _mm_srli_si128(pq1_pq2, 8); - workp_a = _mm_add_epi16( - workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 - // workp_shft0 = _mm_srli_epi16(workp_a, 3); - - // oq1 - workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), - pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 - workp_b = _mm_add_epi16(*q2, *q2); - workp_b = - _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 - - workp_a = _mm_unpacklo_epi64(workp_a, workp_b); - flat_q0q1 = _mm_srli_epi16(workp_a, 3); - - qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - } -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( - __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, - __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0, - const unsigned char *_thresh0, const unsigned char *_blimit1, - const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { - const __m128i zero = _mm_setzero_si128(); - __m128i blimit0, limit0, thresh0; - __m128i t80; - __m128i mask, flat, work; - __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1; - 
__m128i op1, op0, oq0, oq1; - const __m128i four = _mm_set1_epi16(4); - const __m128i one = _mm_set1_epi16(0x1); - const __m128i ffff = _mm_cmpeq_epi16(one, one); - - get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit0, &limit0, &thresh0, &t80); - - abs_p2p1 = abs_diff16(*p2, *p1); - abs_p1p0 = abs_diff16(*p1, *p0); - abs_q1q0 = abs_diff16(*q1, *q0); - abs_q2q1 = abs_diff16(*q2, *q1); - - abs_p0q0 = abs_diff16(*p0, *q0); - abs_p1q1 = abs_diff16(*p1, *q1); - - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); - mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); - - mask = _mm_max_epi16(abs_q2q1, mask); - work = _mm_max_epi16(abs_p1p0, abs_q1q0); - mask = _mm_max_epi16(work, mask); - mask = _mm_max_epi16(mask, abs_p2p1); - mask = _mm_subs_epu16(mask, limit0); - mask = _mm_cmpeq_epi16(mask, zero); - - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); - } - - // flat_mask - flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); - flat = _mm_max_epi16(flat, work); - - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - - flat = _mm_cmpeq_epi16(flat, zero); - flat = _mm_and_si128(flat, mask); // flat & mask - - // 5 tap filter - // need it only if flat !=0 - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; - - // op1 - workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), - _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), - *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 - - 
workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 - op1 = _mm_srli_epi16(workp_shft0, 3); - - // op0 - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 - workp_a = - _mm_add_epi16(workp_a, - workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 - op0 = _mm_srli_epi16(workp_a, 3); - - // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), - *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 - workp_b = _mm_add_epi16(*q1, *q2); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 - oq0 = _mm_srli_epi16(workp_shft0, 3); - - // oq1 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), - *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 - workp_b = _mm_add_epi16(*q2, *q2); - workp_shft1 = _mm_add_epi16( - workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 - oq1 = _mm_srli_epi16(workp_shft1, 3); - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); - } else { - *q0 = qs[0]; - *q1 = qs[1]; - *p0 = ps[0]; - *p1 = ps[1]; - } -} - -void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out; - - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - - 
highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out, - _blimit, _limit, _thresh, bd); - - _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8)); - _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out); - _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out); - _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8)); -} - -void aom_highbd_lpf_horizontal_6_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m128i p2, p1, p0, q0, q1, q2; - - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - - highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, - _limit0, _thresh0, _blimit1, _limit1, - _thresh1, bd); - - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( - __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, - __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, - const unsigned char *_blimit, const unsigned char *_limit, - const unsigned char *_thresh, int bd) { - const __m128i zero = _mm_setzero_si128(); - __m128i blimit, limit, thresh; - __m128i mask, hev, flat; - __m128i pq[4]; - __m128i p1p0, q1q0, ps1ps0, qs1qs0; - __m128i work_a, opq2, flat_p1p0, flat_q0q1; - - pq[0] = _mm_unpacklo_epi64(*p0, *q0); - pq[1] = _mm_unpacklo_epi64(*p1, *q1); - pq[2] = _mm_unpacklo_epi64(*p2, *q2); - pq[3] = _mm_unpacklo_epi64(*p3, *q3); - - __m128i abs_p1p0; - - const __m128i four = _mm_set1_epi16(4); - 
__m128i t80; - const __m128i one = _mm_set1_epi16(0x1); - - get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); - - highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, - &thresh, &hev, &mask); - - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); - - // flat_mask4 - flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); - flat = _mm_max_epi16(abs_p1p0, flat); - flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); - - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - - flat = _mm_cmpeq_epi16(flat, zero); - flat = _mm_and_si128(flat, mask); - // replicate for the further "merged variables" usage - flat = _mm_unpacklo_epi64(flat, flat); - - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { - __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; - // Added before shift for rounding part of ROUND_POWER_OF_TWO - - // o*p2 - workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); - workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); - workp_c = _mm_add_epi16(workp_a, workp_c); - - // o*p1 - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); - workp_shft0 = _mm_add_epi16(workp_a, workp_b); - - // o*p0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); - workp_shft1 = _mm_add_epi16(workp_a, workp_b); - - flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3); - - // oq0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); - workp_shft0 = _mm_add_epi16(workp_a, workp_b); - - // oq1 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); - workp_shft1 = _mm_add_epi16(workp_a, workp_b); - - flat_q0q1 = 
_mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); - - // oq2 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); - workp_a = _mm_add_epi16(workp_a, workp_b); - opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); - - qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - - work_a = _mm_andnot_si128(flat, pq[2]); - *p2 = _mm_and_si128(flat, opq2); - *p2 = _mm_or_si128(work_a, *p2); - *q2 = _mm_srli_si128(*p2, 8); - } -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( - __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, - __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0, - const unsigned char *_limit0, const unsigned char *_thresh0, - const unsigned char *_blimit1, const unsigned char *_limit1, - const unsigned char *_thresh1, int bd) { - __m128i blimit0, limit0, thresh0; - __m128i t80; - __m128i mask, flat; - __m128i work_a, op2, oq2, op1, op0, oq0, oq1; - __m128i abs_p1q1, abs_p0q0, work0, work1, work2; - - const __m128i zero = _mm_setzero_si128(); - const __m128i four = _mm_set1_epi16(4); - const __m128i one = _mm_set1_epi16(0x1); - const __m128i ffff = _mm_cmpeq_epi16(one, one); - - get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit0, &limit0, &thresh0, &t80); - - abs_p0q0 = abs_diff16(*p0, *q0); - abs_p1q1 = abs_diff16(*p1, *q1); - - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); - mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1; - - // So taking maximums continues to work: - mask = 
_mm_and_si128(mask, _mm_adds_epu16(limit0, one)); - - work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1)); - work1 = - _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat - work0 = _mm_max_epi16(work0, work1); - work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3)); - work2 = _mm_max_epi16(work2, work0); - mask = _mm_max_epi16(work2, mask); - - mask = _mm_subs_epu16(mask, limit0); - mask = _mm_cmpeq_epi16(mask, zero); - - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); - } - - flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); - flat = _mm_max_epi16(work1, flat); - work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); - flat = _mm_max_epi16(work0, flat); - - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - flat = _mm_cmpeq_epi16(flat, zero); - flat = _mm_and_si128(flat, mask); // flat & mask - - // filter8 need it only if flat !=0 - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { - __m128i workp_a, workp_b; - // Added before shift for rounding part of ROUND_POWER_OF_TWO - - // o*p2 - workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); - op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // o*p1 - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); - op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // o*p0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); - op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // oq0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); - oq0 = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // oq1 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); - oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // oq2 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); - oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); - - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); - - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); - } else { - *q0 = qs[0]; - *q1 = qs[1]; - *p0 = ps[0]; - *p1 = ps[1]; - } -} - -void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0; - - p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); - - highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, - &p1p0, _blimit, _limit, _thresh, bd); - - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), 
_mm_srli_si128(p1p0, 8)); - _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); -} - -void aom_highbd_lpf_horizontal_8_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m128i p2, p1, p0, q0, q1, q2, p3, q3; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); - - highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, - _blimit0, _limit0, _thresh0, _blimit1, - _limit1, _thresh1, bd); - - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( - __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, - __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - __m128i blimit, limit, thresh; - __m128i mask, hev; - __m128i p1p0, q1q0; - __m128i pq[2]; - - __m128i abs_p1p0; - - __m128i t80; - get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); - - pq[0] = _mm_unpacklo_epi64(*p0, *q0); - pq[1] = _mm_unpacklo_epi64(*p1, *q1); - - highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, - &thresh, &hev, &mask); - - 
highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); -} - -static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( - __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, - __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m128i blimit0, limit0, thresh0; - __m128i mask, flat; - __m128i p[2], q[2]; - - const __m128i zero = _mm_setzero_si128(); - __m128i abs_p0q0 = abs_diff16(*q0, *p0); - __m128i abs_p1q1 = abs_diff16(*q1, *p1); - - __m128i abs_p1p0 = abs_diff16(*p1, *p0); - __m128i abs_q1q0 = abs_diff16(*q1, *q0); - - const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); - const __m128i one = _mm_set1_epi16(1); - - __m128i t80; - - get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit0, &limit0, &thresh0, &t80); - - // filter_mask and hev_mask - flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); - mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); - mask = _mm_max_epi16(flat, mask); - - mask = _mm_subs_epu16(mask, limit0); - mask = _mm_cmpeq_epi16(mask, zero); - - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); -} - -void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - __m128i p1p0, q1q0; - __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - __m128i q1 
= _mm_loadl_epi64((__m128i *)(s + 1 * p)); - - highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, - _thresh, bd); - - _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); - _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); -} - -void aom_highbd_lpf_horizontal_4_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - __m128i ps[2], qs[2]; - - highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, - _thresh0, _blimit1, _limit1, _thresh1, bd); - - _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); - _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); - _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); - _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); -} - -void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - __m128i x0, x1, x2, x3, d0, d1, d2, d3; - __m128i p1p0, q1q0; - __m128i p1, q1; - - x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); - x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); - x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); - - highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); - - highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, - thresh, bd); - - p1 = _mm_srli_si128(p1p0, 8); - q1 = _mm_srli_si128(q1q0, 8); - - // transpose from 8x4 to 4x8 - highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); - - 
_mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); - _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); - _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); - _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); -} - -void aom_highbd_lpf_vertical_4_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i ps[2], qs[2]; - - x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); - x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); - x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); - x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); - x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); - x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); - x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); - - highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, - &d2, &d3); - - highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - - highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, - &d3, &d4, &d5, &d6, &d7); - - _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); - _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); - _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); - _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); - _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); - _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); - _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); - _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); -} - -void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i x3, x2, x1, x0, p0, q0; - __m128i p1p0, q1q0; - - x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * 
p)); - x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); - x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); - x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); - - highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, - &d6, &d7); - - highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, - limit, thresh, bd); - - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); - - highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); - - _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); - _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); - _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); - _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); -} - -void aom_highbd_lpf_vertical_6_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i p0, q0, p1, q1, p2, q2; - - x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); - x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); - x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); - x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); - x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); - x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); - x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); - x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); - - highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, - &p0, &q0, &q1, &q2, &d6, &d7); - - highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, - _limit0, _thresh0, _blimit1, _limit1, - _thresh1, bd); - - highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, - &d6, &d7); - - _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); - _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); - _mm_storel_epi64((__m128i *)(s - 2 + 2 * 
p), d2); - _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); - _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); - _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); - _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); - _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); -} - -void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i p2, p1, p0, p3, q0; - __m128i q1q0, p1p0; - - p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); - p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); - p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); - p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); - - highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5, - &d6, &d7); - - // Loop filtering - highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, - &p1p0, blimit, limit, thresh, bd); - - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); - - highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, - &d1, &d2, &d3); - - _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); - _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); - _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); - _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); -} - -void aom_highbd_lpf_vertical_8_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - - x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); - x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); - x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); - x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); - x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); - x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); - x6 = _mm_loadu_si128((__m128i *)(s - 4 + 
6 * p)); - x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); - - highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, - &d2, &d3, &d4, &d5, &d6, &d7); - - highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, - blimit0, limit0, thresh0, blimit1, limit1, - thresh1, bd); - - highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, - &x2, &x3, &x4, &x5, &x6, &x7); - - _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); - _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); - _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); - _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); - _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); - _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); - _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); - _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); -} - -void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - __m128i q[7], p[7], pq[7]; - __m128i p6, p5, p4, p3; - __m128i p6_2, p5_2, p4_2, p3_2; - __m128i d0, d1, d2, d3; - __m128i d0_2, d1_2, d2_2, d3_2, d7_2; - - p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); - p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); - p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); - p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); - - highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], - &p[3], &p[2], &p[1], &p[0]); - - p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); - p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); - p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); - p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); - - highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2], - &q[3], &q[4], &q[5], &q[6], &d7_2); - - highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); - - highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], - &pq[1], &pq[0], &d0, 
&d1, &d2, &d3); - - q[0] = _mm_srli_si128(pq[0], 8); - q[1] = _mm_srli_si128(pq[1], 8); - q[2] = _mm_srli_si128(pq[2], 8); - q[3] = _mm_srli_si128(pq[3], 8); - q[4] = _mm_srli_si128(pq[4], 8); - q[5] = _mm_srli_si128(pq[5], 8); - - highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], - &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); - - _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); - _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); - - _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); - _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); - - _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); - _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); - - _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); - _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); -} - -void aom_highbd_lpf_vertical_14_dual_sse2( - uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - __m128i q[7], p[7]; - __m128i p6, p5, p4, p3, p2, p1, p0, q0; - __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; - __m128i d0, d7; - __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; - - p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); - p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); - p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); - p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); - p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); - p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); - p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); - q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); - - highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], - &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); - - p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); - p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); - p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); - 
p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); - p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); - p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); - p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); - q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); - - highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, - &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], - &q[6], &d7); - - highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, - limit1, thresh1, bd); - - highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], - &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, - &d6_out, &d7_out); - - _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); - _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); - _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); - _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); - _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); - _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); - _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); - _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); - - highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, - &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, - &d6_out, &d7_out); - - _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); - _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); - _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); - _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); - _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); - _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); - _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); - _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c deleted file mode 100644 index b9689202a..000000000 --- 
a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { - const __m128i sign = _mm_srai_epi16(*p, 15); - const __m128i dc = _mm_unpacklo_epi16(*p, sign); - const __m128i ac = _mm_unpackhi_epi16(*p, sign); - *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); -} - -static INLINE void update_qp(__m256i *qp) { - int i; - for (i = 0; i < 5; ++i) { - qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); - } -} - -static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *dequant_ptr, - const int16_t *quant_shift_ptr, __m256i *qp) { - const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); - const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); - const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); - const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); - init_one_qp(&zbin, &qp[0]); - init_one_qp(&round, &qp[1]); - init_one_qp(&quant, &qp[2]); - init_one_qp(&dequant, &qp[3]); - init_one_qp(&quant_shift, &qp[4]); -} - -// Note: -// *x is vector multiplied by *y which is 16 int32_t parallel multiplication -// and right shift 16. 
The output, 16 int32_t is save in *p. -static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y, - __m256i *p) { - __m256i prod_lo = _mm256_mul_epi32(*x, *y); - __m256i prod_hi = _mm256_srli_epi64(*x, 32); - const __m256i mult_hi = _mm256_srli_epi64(*y, 32); - prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); - - prod_lo = _mm256_srli_epi64(prod_lo, 16); - const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); - prod_lo = _mm256_and_si256(prod_lo, mask); - prod_hi = _mm256_srli_epi64(prod_hi, 16); - - prod_hi = _mm256_slli_epi64(prod_hi, 32); - *p = _mm256_or_si256(prod_lo, prod_hi); -} - -static INLINE void quantize(const __m256i *qp, __m256i *c, - const int16_t *iscan_ptr, tran_low_t *qcoeff, - tran_low_t *dqcoeff, __m256i *eob) { - const __m256i abs = _mm256_abs_epi32(*c); - const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]); - __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]); - flag2 = _mm256_or_si256(flag1, flag2); - const int32_t nzflag = _mm256_movemask_epi8(flag2); - - if (LIKELY(nzflag)) { - __m256i q = _mm256_add_epi32(abs, qp[1]); - __m256i tmp; - mm256_mul_shift_epi32(&q, &qp[2], &tmp); - q = _mm256_add_epi32(tmp, q); - - mm256_mul_shift_epi32(&q, &qp[4], &q); - __m256i dq = _mm256_mullo_epi32(q, qp[3]); - - q = _mm256_sign_epi32(q, *c); - dq = _mm256_sign_epi32(dq, *c); - q = _mm256_and_si256(q, flag2); - dq = _mm256_and_si256(dq, flag2); - - _mm256_storeu_si256((__m256i *)qcoeff, q); - _mm256_storeu_si256((__m256i *)dqcoeff, dq); - - const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); - const __m128i zr = _mm_setzero_si128(); - const __m128i lo = _mm_unpacklo_epi16(isc, zr); - const __m128i hi = _mm_unpackhi_epi16(isc, zr); - const __m256i iscan = - _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); - - const __m256i zero = _mm256_setzero_si256(); - const __m256i zc = _mm256_cmpeq_epi32(dq, zero); - const __m256i nz = _mm256_cmpeq_epi32(zc, zero); - __m256i cur_eob = _mm256_sub_epi32(iscan, nz); - 
cur_eob = _mm256_and_si256(cur_eob, nz); - *eob = _mm256_max_epi32(cur_eob, *eob); - } else { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)qcoeff, zero); - _mm256_storeu_si256((__m256i *)dqcoeff, zero); - } -} - -void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - (void)scan; - const unsigned int step = 8; - - __m256i qp[5], coeff; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - - __m256i eob = _mm256_setzero_si256(); - quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - - update_qp(qp); - - while (n_coeffs > 0) { - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } -} diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c deleted file mode 100644 index 58e5f98e5..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_mem/aom_mem.h" -#include "aom_ports/mem.h" - -void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; - __m128i zbins[2]; - __m128i nzbins[2]; - - zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], - (int)zbin_ptr[0]); - zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - (void)scan; - - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } - - // Quantization pass: - for (i = 0; i < 
non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; - } - } - } - *eob_ptr = eob_i + 1; -} - -void aom_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - __m128i zbins[2]; - __m128i nzbins[2]; - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; - zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); - zbins[1] = _mm_set1_epi32(zbin1_tmp); - - nzbins[0] = _mm_setzero_si128(); - nzbins[1] = _mm_setzero_si128(); - nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); - nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. 
- for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } - *eob_ptr = eob + 1; -} diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm deleted file mode 100644 index e0d22522d..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm +++ /dev/null @@ -1,296 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro HIGH_PROCESS_4x2x4 5-6 0 - movh m0, [srcq +%2*2] -%if %1 == 1 - movu m4, [ref1q+%3*2] - movu m5, [ref2q+%3*2] - movu m6, [ref3q+%3*2] - movu m7, [ref4q+%3*2] - movhps m0, [srcq +%4*2] - movhps m4, [ref1q+%5*2] - movhps m5, [ref2q+%5*2] - movhps m6, [ref3q+%5*2] - movhps m7, [ref4q+%5*2] - mova m3, m0 - mova m2, m0 - psubusw m3, m4 - psubusw m2, m5 - psubusw m4, m0 - psubusw m5, m0 - por m4, m3 - por m5, m2 - pmaddwd m4, m1 - pmaddwd m5, m1 - mova m3, m0 - mova m2, m0 - psubusw m3, m6 - psubusw m2, m7 - psubusw m6, m0 - psubusw m7, m0 - por m6, m3 - por m7, m2 - pmaddwd m6, m1 - pmaddwd m7, m1 -%else - movu m2, [ref1q+%3*2] - movhps m0, [srcq +%4*2] - movhps m2, [ref1q+%5*2] - mova m3, m0 - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - pmaddwd m2, m1 - paddd m4, m2 - - movu m2, [ref2q+%3*2] - mova m3, m0 - movhps m2, [ref2q+%5*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - pmaddwd m2, m1 - paddd m5, m2 - - movu m2, [ref3q+%3*2] - mova m3, m0 - movhps m2, [ref3q+%5*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - pmaddwd m2, m1 - paddd m6, m2 - - movu m2, [ref4q+%3*2] - mova m3, m0 - movhps m2, [ref4q+%5*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - pmaddwd m2, m1 - paddd m7, m2 -%endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*4] - lea ref1q, [ref1q+ref_strideq*4] - lea ref2q, [ref2q+ref_strideq*4] - lea ref3q, [ref3q+ref_strideq*4] - lea ref4q, [ref4q+ref_strideq*4] -%endif -%endmacro - -; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro HIGH_PROCESS_8x2x4 5-6 0 - ; 1st 8 px - mova m0, [srcq +%2*2] -%if %1 == 1 - movu m4, [ref1q+%3*2] - movu m5, [ref2q+%3*2] - movu m6, [ref3q+%3*2] - movu m7, [ref4q+%3*2] - mova m3, m0 - mova m2, m0 - psubusw m3, m4 - psubusw m2, m5 - psubusw m4, m0 - psubusw m5, m0 - por m4, m3 - por m5, m2 - pmaddwd m4, m1 - pmaddwd m5, m1 - mova m3, m0 - 
mova m2, m0 - psubusw m3, m6 - psubusw m2, m7 - psubusw m6, m0 - psubusw m7, m0 - por m6, m3 - por m7, m2 - pmaddwd m6, m1 - pmaddwd m7, m1 -%else - mova m3, m0 - movu m2, [ref1q+%3*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - mova m3, m0 - pmaddwd m2, m1 - paddd m4, m2 - movu m2, [ref2q+%3*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - mova m3, m0 - pmaddwd m2, m1 - paddd m5, m2 - movu m2, [ref3q+%3*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - mova m3, m0 - pmaddwd m2, m1 - paddd m6, m2 - movu m2, [ref4q+%3*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - pmaddwd m2, m1 - paddd m7, m2 -%endif - - ; 2nd 8 px - mova m0, [srcq +(%4)*2] - mova m3, m0 - movu m2, [ref1q+(%5)*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - mova m3, m0 - pmaddwd m2, m1 - paddd m4, m2 - movu m2, [ref2q+(%5)*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - mova m3, m0 - pmaddwd m2, m1 - paddd m5, m2 - movu m2, [ref3q+(%5)*2] - psubusw m3, m2 - psubusw m2, m0 - por m2, m3 - mova m3, m0 - pmaddwd m2, m1 - paddd m6, m2 - movu m2, [ref4q+(%5)*2] - psubusw m3, m2 - psubusw m2, m0 -%if %6 == 1 - lea srcq, [srcq +src_strideq*4] - lea ref1q, [ref1q+ref_strideq*4] - lea ref2q, [ref2q+ref_strideq*4] - lea ref3q, [ref3q+ref_strideq*4] - lea ref4q, [ref4q+ref_strideq*4] -%endif - por m2, m3 - pmaddwd m2, m1 - paddd m7, m2 -%endmacro - -; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro HIGH_PROCESS_16x2x4 5-6 0 - HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) - HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 -%endmacro - -; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro HIGH_PROCESS_32x2x4 5-6 0 - HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) - HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 -%endmacro - -; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro HIGH_PROCESS_64x2x4 5-6 0 - HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) - HIGH_PROCESS_32x2x4 
0, %4, %5, (%4 + 32), (%5 + 32), %6 -%endmacro - -; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, -; uint8_t *ref[4], int ref_stride, -; uint32_t res[4]); -; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 -%if UNIX64 -cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ - res, ref2, ref3, ref4 -%else -cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ - ref2, ref3, ref4 -%endif - -; set m1 - push srcq - mov srcd, 0x00010001 - movd m1, srcd - pshufd m1, m1, 0x0 - pop srcq - - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - mov ref2q, [ref1q+gprsize*1] - mov ref3q, [ref1q+gprsize*2] - mov ref4q, [ref1q+gprsize*3] - mov ref1q, [ref1q+gprsize*0] - -; convert byte pointers to short pointers - shl srcq, 1 - shl ref2q, 1 - shl ref3q, 1 - shl ref4q, 1 - shl ref1q, 1 - - HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 - HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 -%endrep - HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 - ; N.B. HIGH_PROCESS outputs dwords (32 bits) - ; so in high bit depth even the smallest width (4) needs 128bits i.e. 
XMM - movhlps m0, m4 - movhlps m1, m5 - movhlps m2, m6 - movhlps m3, m7 - paddd m4, m0 - paddd m5, m1 - paddd m6, m2 - paddd m7, m3 - punpckldq m4, m5 - punpckldq m6, m7 - movhlps m0, m4 - movhlps m1, m6 - paddd m4, m0 - paddd m6, m1 - punpcklqdq m4, m6 - movifnidn r4, r4mp - movu [r4], m4 - RET -%endmacro - - -INIT_XMM sse2 -HIGH_SADNXN4D 64, 64 -HIGH_SADNXN4D 64, 32 -HIGH_SADNXN4D 32, 64 -HIGH_SADNXN4D 32, 32 -HIGH_SADNXN4D 32, 16 -HIGH_SADNXN4D 16, 32 -HIGH_SADNXN4D 16, 16 -HIGH_SADNXN4D 16, 8 -HIGH_SADNXN4D 8, 16 -HIGH_SADNXN4D 8, 8 -HIGH_SADNXN4D 8, 4 -HIGH_SADNXN4D 4, 8 -HIGH_SADNXN4D 4, 4 -HIGH_SADNXN4D 4, 16 -HIGH_SADNXN4D 16, 4 -HIGH_SADNXN4D 8, 32 -HIGH_SADNXN4D 32, 8 -HIGH_SADNXN4D 16, 64 -HIGH_SADNXN4D 64, 16 diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm deleted file mode 100644 index 3398d8a2a..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm +++ /dev/null @@ -1,374 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro HIGH_SAD_FN 4 -%if %4 == 0 -%if %3 == 5 -cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows -%else ; %3 == 7 -cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows -%endif ; %3 == 5/7 -%else ; avg -%if %3 == 5 -cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ - second_pred, n_rows -%else ; %3 == 7 -cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ - ref, ref_stride, \ - second_pred, \ - src_stride3, ref_stride3 -%if ARCH_X86_64 -%define n_rowsd r7d -%else ; x86-32 -%define n_rowsd dword r0m -%endif ; x86-32/64 -%endif ; %3 == 5/7 -%endif ; avg/sad - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided -%if %3 == 7 - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] -%endif ; %3 == 7 -; convert src, ref & second_pred to short ptrs (from byte ptrs) - shl srcq, 1 - shl refq, 1 -%if %4 == 1 - shl second_predq, 1 -%endif -%endmacro - -; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro HIGH_SAD64XN 1-2 0 - HIGH_SAD_FN 64, %1, 5, %2 - mov n_rowsd, %1 - pxor m0, m0 - pxor m6, m6 - -.loop: - ; first half of each row - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+32] - movu m4, [refq+48] -%if %2 == 1 - pavgw m1, [second_predq+mmsize*0] - pavgw m2, [second_predq+mmsize*1] - pavgw m3, [second_predq+mmsize*2] - pavgw m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - mova m5, [srcq] - psubusw m5, m1 - psubusw m1, [srcq] - por m1, m5 - mova m5, [srcq+16] - psubusw m5, m2 - psubusw m2, [srcq+16] - por m2, m5 - mova m5, [srcq+32] - psubusw m5, m3 - psubusw m3, [srcq+32] - por m3, m5 - mova m5, [srcq+48] - psubusw m5, m4 - psubusw m4, [srcq+48] - por m4, m5 - paddw m1, m2 - paddw m3, m4 - movhlps m2, m1 - movhlps m4, m3 - 
paddw m1, m2 - paddw m3, m4 - punpcklwd m1, m6 - punpcklwd m3, m6 - paddd m0, m1 - paddd m0, m3 - ; second half of each row - movu m1, [refq+64] - movu m2, [refq+80] - movu m3, [refq+96] - movu m4, [refq+112] -%if %2 == 1 - pavgw m1, [second_predq+mmsize*0] - pavgw m2, [second_predq+mmsize*1] - pavgw m3, [second_predq+mmsize*2] - pavgw m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - mova m5, [srcq+64] - psubusw m5, m1 - psubusw m1, [srcq+64] - por m1, m5 - mova m5, [srcq+80] - psubusw m5, m2 - psubusw m2, [srcq+80] - por m2, m5 - mova m5, [srcq+96] - psubusw m5, m3 - psubusw m3, [srcq+96] - por m3, m5 - mova m5, [srcq+112] - psubusw m5, m4 - psubusw m4, [srcq+112] - por m4, m5 - paddw m1, m2 - paddw m3, m4 - movhlps m2, m1 - movhlps m4, m3 - paddw m1, m2 - paddw m3, m4 - punpcklwd m1, m6 - punpcklwd m3, m6 - lea refq, [refq+ref_strideq*2] - paddd m0, m1 - lea srcq, [srcq+src_strideq*2] - paddd m0, m3 - - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - punpckldq m0, m6 - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 -HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 -HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 -HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 -HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 -HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 - -; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro HIGH_SAD32XN 1-2 0 - HIGH_SAD_FN 32, %1, 5, %2 - mov n_rowsd, %1 - pxor m0, m0 - pxor m6, m6 - -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+32] - movu m4, [refq+48] -%if %2 == 1 - pavgw m1, [second_predq+mmsize*0] - pavgw m2, [second_predq+mmsize*1] - pavgw m3, [second_predq+mmsize*2] - pavgw m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - mova m5, [srcq] - psubusw m5, m1 - psubusw m1, [srcq] - por m1, m5 - mova m5, [srcq+16] - psubusw m5, m2 - 
psubusw m2, [srcq+16] - por m2, m5 - mova m5, [srcq+32] - psubusw m5, m3 - psubusw m3, [srcq+32] - por m3, m5 - mova m5, [srcq+48] - psubusw m5, m4 - psubusw m4, [srcq+48] - por m4, m5 - paddw m1, m2 - paddw m3, m4 - movhlps m2, m1 - movhlps m4, m3 - paddw m1, m2 - paddw m3, m4 - punpcklwd m1, m6 - punpcklwd m3, m6 - lea refq, [refq+ref_strideq*2] - paddd m0, m1 - lea srcq, [srcq+src_strideq*2] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - punpckldq m0, m6 - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 -HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 -HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 -HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 -HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 -HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 -HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2 -HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2 - -; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro HIGH_SAD16XN 1-2 0 - HIGH_SAD_FN 16, %1, 5, %2 - mov n_rowsd, %1/2 - pxor m0, m0 - pxor m6, m6 - -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+ref_strideq*2] - movu m4, [refq+ref_strideq*2+16] -%if %2 == 1 - pavgw m1, [second_predq+mmsize*0] - pavgw m2, [second_predq+16] - pavgw m3, [second_predq+mmsize*2] - pavgw m4, [second_predq+mmsize*2+16] - lea second_predq, [second_predq+mmsize*4] -%endif - mova m5, [srcq] - psubusw m5, m1 - psubusw m1, [srcq] - por m1, m5 - mova m5, [srcq+16] - psubusw m5, m2 - psubusw m2, [srcq+16] - por m2, m5 - mova m5, [srcq+src_strideq*2] - psubusw m5, m3 - psubusw m3, [srcq+src_strideq*2] - por m3, m5 - mova m5, [srcq+src_strideq*2+16] - psubusw m5, m4 - psubusw m4, [srcq+src_strideq*2+16] - por m4, m5 - paddw m1, m2 - paddw m3, m4 - movhlps m2, m1 - movhlps m4, m3 - paddw m1, m2 - paddw m3, m4 - punpcklwd m1, m6 - punpcklwd m3, m6 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, 
[srcq+src_strideq*4] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - punpckldq m0, m6 - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 -HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 -HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 -HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 -HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 -HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 -HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 -HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 -HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 -HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 - -; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro HIGH_SAD8XN 1-2 0 - HIGH_SAD_FN 8, %1, 7, %2 - mov n_rowsd, %1/4 - pxor m0, m0 - pxor m6, m6 - -.loop: - movu m1, [refq] - movu m2, [refq+ref_strideq*2] - movu m3, [refq+ref_strideq*4] - movu m4, [refq+ref_stride3q*2] -%if %2 == 1 - pavgw m1, [second_predq+mmsize*0] - pavgw m2, [second_predq+mmsize*1] - pavgw m3, [second_predq+mmsize*2] - pavgw m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - mova m5, [srcq] - psubusw m5, m1 - psubusw m1, [srcq] - por m1, m5 - mova m5, [srcq+src_strideq*2] - psubusw m5, m2 - psubusw m2, [srcq+src_strideq*2] - por m2, m5 - mova m5, [srcq+src_strideq*4] - psubusw m5, m3 - psubusw m3, [srcq+src_strideq*4] - por m3, m5 - mova m5, [srcq+src_stride3q*2] - psubusw m5, m4 - psubusw m4, [srcq+src_stride3q*2] - por m4, m5 - paddw m1, m2 - paddw m3, m4 - movhlps m2, m1 - movhlps m4, m3 - paddw m1, m2 - paddw m3, m4 - punpcklwd m1, m6 - punpcklwd m3, m6 - lea refq, [refq+ref_strideq*8] - paddd m0, m1 - lea srcq, [srcq+src_strideq*8] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - punpckldq m0, m6 - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 -HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 
-HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 -HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 -HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 -HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 -HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 -HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm deleted file mode 100644 index 61f5b8e86..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ /dev/null @@ -1,1036 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_8: times 8 dw 8 -bilin_filter_m_sse2: times 8 dw 16 - times 8 dw 0 - times 8 dw 14 - times 8 dw 2 - times 8 dw 12 - times 8 dw 4 - times 8 dw 10 - times 8 dw 6 - times 16 dw 8 - times 8 dw 6 - times 8 dw 10 - times 8 dw 4 - times 8 dw 12 - times 8 dw 2 - times 8 dw 14 - -SECTION .text - -; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, -; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, -; int height, unsigned int *sse); -; -; This function returns the SE and stores SSE in the given pointer. 
- -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse - psubw %3, %4 - psubw %1, %2 - mova %4, %3 ; make copies to manipulate to calc sum - mova %2, %1 ; use originals for calc sse - pmaddwd %3, %3 - paddw %4, %2 - pmaddwd %1, %1 - movhlps %2, %4 - paddd %6, %3 - paddw %4, %2 - pxor %2, %2 - pcmpgtw %2, %4 ; mask for 0 > %4 (sum) - punpcklwd %4, %2 ; sign-extend word to dword - paddd %6, %1 - paddd %5, %4 - -%endmacro - -%macro STORE_AND_RET 0 -%if mmsize == 16 - ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit - ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. - ; We have to sign-extend it before adding the words within the register - ; and outputing to a dword. - movhlps m3, m7 - movhlps m4, m6 - paddd m7, m3 - paddd m6, m4 - pshufd m3, m7, 0x1 - pshufd m4, m6, 0x1 - paddd m7, m3 - paddd m6, m4 - mov r1, ssem ; r1 = unsigned int *sse - movd [r1], m7 ; store sse - movd eax, m6 ; store sum as return value -%endif - RET -%endmacro - -%macro INC_SRC_BY_SRC_STRIDE 0 -%if ARCH_X86=1 && CONFIG_PIC=1 - add srcq, src_stridemp - add srcq, src_stridemp -%else - lea srcq, [srcq + src_strideq*2] -%endif -%endmacro - -%macro SUBPEL_VARIANCE 1-2 0 ; W -%define bilin_filter_m bilin_filter_m_sse2 -%define filter_idx_shift 5 - - -%if ARCH_X86_64 - %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq - %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, height, sse - %endif - %define block_height heightd - %define bilin_filter sseq -%else - %if CONFIG_PIC=1 - %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse, \ - g_bilin_filter, g_pw_8 - %define block_height dword heightm - %define sec_str sec_stridemp - - ; 
Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %else - cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 - %define block_height heightd - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif - %else - %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define block_height dword heightm - %define sec_str sec_stridemp - %else - cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, height, sse - %define block_height heightd - %endif - - %define bilin_filter bilin_filter_m - %endif -%endif - - ASSERT %1 <= 16 ; m6 overflows if w > 16 - pxor m6, m6 ; sum - pxor m7, m7 ; sse - -%if %1 < 16 - sar block_height, 1 -%endif -%if %2 == 1 ; avg - shl sec_str, 1 -%endif - - ; FIXME(rbultje) replace by jumptable? 
- test x_offsetd, x_offsetd - jnz .x_nonzero - ; x_offset == 0 - test y_offsetd, y_offsetd - jnz .x_zero_y_nonzero - - ; x_offset == 0 && y_offset == 0 -.x_zero_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m2, [srcq + 16] - mova m1, [dstq] - mova m3, [dstq + 16] -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m2, [secq+16] -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m2, [srcq + src_strideq*2] - mova m1, [dstq] - mova m3, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_zero_y_zero_loop - STORE_AND_RET - -.x_zero_y_nonzero: - cmp y_offsetd, 8 - jne .x_zero_y_nonhalf - - ; x_offset == 0 && y_offset == 0.5 -.x_zero_y_half_loop: -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+16] - movu m4, [srcq+src_strideq*2] - movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] - pavgw m0, m4 - pavgw m1, m5 -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - - lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m1, [srcq+src_strideq*2] - movu m5, [srcq+src_strideq*4] - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] - pavgw m0, m1 - pavgw m1, m5 -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - - lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_zero_y_half_loop - STORE_AND_RET - -.x_zero_y_nonhalf: - ; x_offset == 0 && y_offset == bilin 
interpolation -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 - mova m8, [bilin_filter+y_offsetq] - mova m9, [bilin_filter+y_offsetq+16] - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ; x86-32 or mmx -%if ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0, reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_zero_y_other_loop: -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq + 16] - movu m4, [srcq+src_strideq*2] - movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] - ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can - ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of - ; instructions is the same (5), but it is 1 mul instead of 2, so might be - ; slightly faster because of pmullw latency. It would also cut our rodata - ; tables in half for this function, and save 1-2 registers on x86-64. 
- pmullw m1, filter_y_a - pmullw m5, filter_y_b - paddw m1, filter_rnd - pmullw m0, filter_y_a - pmullw m4, filter_y_b - paddw m0, filter_rnd - paddw m1, m5 - paddw m0, m4 - psrlw m1, 4 - psrlw m0, 4 -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - - lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m1, [srcq+src_strideq*2] - movu m5, [srcq+src_strideq*4] - mova m4, m1 - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] - pmullw m1, filter_y_a - pmullw m5, filter_y_b - paddw m1, filter_rnd - pmullw m0, filter_y_a - pmullw m4, filter_y_b - paddw m0, filter_rnd - paddw m1, m5 - paddw m0, m4 - psrlw m1, 4 - psrlw m0, 4 -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - - lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_zero_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET - -.x_nonzero: - cmp x_offsetd, 8 - jne .x_nonhalf - ; x_offset == 0.5 - test y_offsetd, y_offsetd - jnz .x_half_y_nonzero - - ; x_offset == 0.5 && y_offset == 0 -.x_half_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq + 16] - movu m4, [srcq + 2] - movu m5, [srcq + 18] - mova m2, [dstq] - mova m3, [dstq + 16] - pavgw m0, m4 - pavgw m1, m5 -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - - lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m1, [srcq + src_strideq*2] - movu m4, [srcq + 2] - movu m5, [srcq + src_strideq*2 + 2] - mova m2, [dstq] - mova m3, [dstq + dst_strideq*2] - pavgw m0, m4 - pavgw m1, m5 -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw 
m1, [secq] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - - lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_half_y_zero_loop - STORE_AND_RET - -.x_half_y_nonzero: - cmp y_offsetd, 8 - jne .x_half_y_nonhalf - - ; x_offset == 0.5 && y_offset == 0.5 -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+2] - movu m3, [srcq+18] - lea srcq, [srcq + src_strideq*2] - pavgw m0, m2 - pavgw m1, m3 -.x_half_y_half_loop: - movu m2, [srcq] - movu m3, [srcq + 16] - movu m4, [srcq + 2] - movu m5, [srcq + 18] - pavgw m2, m4 - pavgw m3, m5 - pavgw m0, m2 - pavgw m1, m3 - mova m4, [dstq] - mova m5, [dstq + 16] -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m4, m1, m5, m6, m7 - mova m0, m2 - mova m1, m3 - - lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m2, [srcq+2] - lea srcq, [srcq + src_strideq*2] - pavgw m0, m2 -.x_half_y_half_loop: - movu m2, [srcq] - movu m3, [srcq + src_strideq*2] - movu m4, [srcq + 2] - movu m5, [srcq + src_strideq*2 + 2] - pavgw m2, m4 - pavgw m3, m5 - pavgw m0, m2 - pavgw m2, m3 - mova m4, [dstq] - mova m5, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] -%endif - SUM_SSE m0, m4, m2, m5, m6, m7 - mova m0, m3 - - lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_half_y_half_loop - STORE_AND_RET - -.x_half_y_nonhalf: - ; x_offset == 0.5 && y_offset == bilin interpolation -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 - mova m8, [bilin_filter+y_offsetq] - mova m9, [bilin_filter+y_offsetq+16] - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 
-%define filter_rnd m10 -%else ; x86_32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0.5. We can reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+2] - movu m3, [srcq+18] - lea srcq, [srcq + src_strideq*2] - pavgw m0, m2 - pavgw m1, m3 -.x_half_y_other_loop: - movu m2, [srcq] - movu m3, [srcq+16] - movu m4, [srcq+2] - movu m5, [srcq+18] - pavgw m2, m4 - pavgw m3, m5 - mova m4, m2 - mova m5, m3 - pmullw m1, filter_y_a - pmullw m3, filter_y_b - paddw m1, filter_rnd - paddw m1, m3 - pmullw m0, filter_y_a - pmullw m2, filter_y_b - paddw m0, filter_rnd - psrlw m1, 4 - paddw m0, m2 - mova m2, [dstq] - psrlw m0, 4 - mova m3, [dstq+16] -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - mova m0, m4 - mova m1, m5 - - lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m2, [srcq+2] - lea srcq, [srcq + src_strideq*2] - pavgw m0, m2 -.x_half_y_other_loop: - movu m2, [srcq] - movu m3, [srcq+src_strideq*2] - movu m4, [srcq+2] - movu m5, [srcq+src_strideq*2+2] - pavgw m2, m4 - pavgw m3, m5 - mova m4, m2 - mova m5, m3 - pmullw m4, filter_y_a - pmullw m3, filter_y_b - paddw m4, filter_rnd - paddw m4, m3 - pmullw m0, filter_y_a - pmullw m2, filter_y_b - paddw m0, filter_rnd - psrlw m4, 4 - paddw m0, m2 - mova m2, [dstq] - psrlw m0, 4 - mova m3, [dstq+dst_strideq*2] -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] -%endif - SUM_SSE m0, m2, m4, m3, m6, m7 - mova m0, m5 - - lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] 
-%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_half_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET - -.x_nonhalf: - test y_offsetd, y_offsetd - jnz .x_nonhalf_y_nonzero - - ; x_offset == bilin interpolation && y_offset == 0 -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 - mova m8, [bilin_filter+x_offsetq] - mova m9, [bilin_filter+x_offsetq+16] - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; y_offset == 0. We can reuse y_offset reg. -%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_other_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+2] - movu m3, [srcq+18] - mova m4, [dstq] - mova m5, [dstq+16] - pmullw m1, filter_x_a - pmullw m3, filter_x_b - paddw m1, filter_rnd - pmullw m0, filter_x_a - pmullw m2, filter_x_b - paddw m0, filter_rnd - paddw m1, m3 - paddw m0, m2 - psrlw m1, 4 - psrlw m0, 4 -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m4, m1, m5, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m1, [srcq+src_strideq*2] - movu m2, [srcq+2] - movu m3, [srcq+src_strideq*2+2] - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] - pmullw m1, filter_x_a - pmullw m3, filter_x_b - paddw m1, filter_rnd - pmullw m0, filter_x_a - pmullw m2, filter_x_b - paddw m0, filter_rnd - paddw m1, m3 - paddw m0, m2 - psrlw m1, 4 - psrlw m0, 4 -%if %2 == 1 ; avg - 
pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] -%endif - SUM_SSE m0, m4, m1, m5, m6, m7 - - lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_other_y_zero_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET - -.x_nonhalf_y_nonzero: - cmp y_offsetd, 8 - jne .x_nonhalf_y_nonhalf - - ; x_offset == bilin interpolation && y_offset == 0.5 -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 - mova m8, [bilin_filter+x_offsetq] - mova m9, [bilin_filter+x_offsetq+16] - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; y_offset == 0.5. We can reuse y_offset reg. -%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+2] - movu m3, [srcq+18] - pmullw m0, filter_x_a - pmullw m2, filter_x_b - paddw m0, filter_rnd - pmullw m1, filter_x_a - pmullw m3, filter_x_b - paddw m1, filter_rnd - paddw m0, m2 - paddw m1, m3 - psrlw m0, 4 - psrlw m1, 4 - lea srcq, [srcq+src_strideq*2] -.x_other_y_half_loop: - movu m2, [srcq] - movu m3, [srcq+16] - movu m4, [srcq+2] - movu m5, [srcq+18] - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m2, filter_rnd - pmullw m3, filter_x_a - pmullw m5, filter_x_b - paddw m3, filter_rnd - paddw m2, m4 - paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+16] - psrlw m2, 4 - psrlw m3, 4 - pavgw m0, m2 - pavgw m1, m3 -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m4, m1, m5, m6, 
m7 - mova m0, m2 - mova m1, m3 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m2, [srcq+2] - pmullw m0, filter_x_a - pmullw m2, filter_x_b - paddw m0, filter_rnd - paddw m0, m2 - psrlw m0, 4 - lea srcq, [srcq+src_strideq*2] -.x_other_y_half_loop: - movu m2, [srcq] - movu m3, [srcq+src_strideq*2] - movu m4, [srcq+2] - movu m5, [srcq+src_strideq*2+2] - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m2, filter_rnd - pmullw m3, filter_x_a - pmullw m5, filter_x_b - paddw m3, filter_rnd - paddw m2, m4 - paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] - psrlw m2, 4 - psrlw m3, 4 - pavgw m0, m2 - pavgw m2, m3 -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] -%endif - SUM_SSE m0, m4, m2, m5, m6, m7 - mova m0, m3 - - lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_other_y_half_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET - -.x_nonhalf_y_nonhalf: -; loading filter - this is same as in 8-bit depth -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 - mova m8, [bilin_filter+x_offsetq] - mova m9, [bilin_filter+x_offsetq+16] - mova m10, [bilin_filter+y_offsetq] - mova m11, [bilin_filter+y_offsetq+16] - mova m12, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_y_a m10 -%define filter_y_b m11 -%define filter_rnd m12 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; In this case, there is NO unused register. Used src_stride register. Later, -; src_stride has to be loaded from stack when it is needed. 
-%define tempq src_strideq - mov tempq, g_bilin_filterm - add x_offsetq, tempq - add y_offsetq, tempq -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter - add y_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif -; end of load filter - - ; x_offset == bilin interpolation && y_offset == bilin interpolation -%if %1 == 16 - movu m0, [srcq] - movu m2, [srcq+2] - movu m1, [srcq+16] - movu m3, [srcq+18] - pmullw m0, filter_x_a - pmullw m2, filter_x_b - paddw m0, filter_rnd - pmullw m1, filter_x_a - pmullw m3, filter_x_b - paddw m1, filter_rnd - paddw m0, m2 - paddw m1, m3 - psrlw m0, 4 - psrlw m1, 4 - - INC_SRC_BY_SRC_STRIDE - -.x_other_y_other_loop: - movu m2, [srcq] - movu m4, [srcq+2] - movu m3, [srcq+16] - movu m5, [srcq+18] - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m2, filter_rnd - pmullw m3, filter_x_a - pmullw m5, filter_x_b - paddw m3, filter_rnd - paddw m2, m4 - paddw m3, m5 - psrlw m2, 4 - psrlw m3, 4 - mova m4, m2 - mova m5, m3 - pmullw m0, filter_y_a - pmullw m2, filter_y_b - paddw m0, filter_rnd - pmullw m1, filter_y_a - pmullw m3, filter_y_b - paddw m0, m2 - paddw m1, filter_rnd - mova m2, [dstq] - paddw m1, m3 - psrlw m0, 4 - psrlw m1, 4 - mova m3, [dstq+16] -%if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] -%endif - SUM_SSE m0, m2, m1, m3, m6, m7 - mova m0, m4 - mova m1, m5 - - INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 2] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%else ; %1 < 16 - movu m0, [srcq] - movu m2, [srcq+2] - pmullw m0, filter_x_a - pmullw m2, filter_x_b - paddw m0, filter_rnd - paddw m0, m2 - psrlw m0, 4 - - INC_SRC_BY_SRC_STRIDE - -.x_other_y_other_loop: - movu m2, [srcq] - movu 
m4, [srcq+2] - INC_SRC_BY_SRC_STRIDE - movu m3, [srcq] - movu m5, [srcq+2] - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m2, filter_rnd - pmullw m3, filter_x_a - pmullw m5, filter_x_b - paddw m3, filter_rnd - paddw m2, m4 - paddw m3, m5 - psrlw m2, 4 - psrlw m3, 4 - mova m4, m2 - mova m5, m3 - pmullw m0, filter_y_a - pmullw m2, filter_y_b - paddw m0, filter_rnd - pmullw m4, filter_y_a - pmullw m3, filter_y_b - paddw m0, m2 - paddw m4, filter_rnd - mova m2, [dstq] - paddw m4, m3 - psrlw m0, 4 - psrlw m4, 4 - mova m3, [dstq+dst_strideq*2] -%if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] -%endif - SUM_SSE m0, m2, m4, m3, m6, m7 - mova m0, m5 - - INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 4] -%if %2 == 1 ; avg - add secq, sec_str -%endif -%endif - dec block_height - jg .x_other_y_other_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET -%endmacro - -INIT_XMM sse2 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM sse2 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c deleted file mode 100644 index 18eb03d12..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <emmintrin.h> -#include <stddef.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, - ptrdiff_t pred_stride); - -static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - __m128i u0, u1, u2, u3; - __m128i v0, v1, v2, v3; - __m128i x0, x1, x2, x3; - int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - - u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - - v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); - - x0 = _mm_sub_epi16(u0, v0); - x1 = _mm_sub_epi16(u1, v1); - x2 = _mm_sub_epi16(u2, v2); - x3 = _mm_sub_epi16(u3, v3); - - _mm_storel_epi64((__m128i *)store_diff, x0); - store_diff = (int64_t *)(diff + 1 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x1); - store_diff = (int64_t *)(diff + 2 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x2); - store_diff = (int64_t *)(diff + 3 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x3); -} - -static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - - u0 = _mm_loadu_si128((__m128i 
const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); - u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); - - v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); - v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); - - x0 = _mm_sub_epi16(u0, v0); - x1 = _mm_sub_epi16(u1, v1); - x2 = _mm_sub_epi16(u2, v2); - x3 = _mm_sub_epi16(u3, v3); - x4 = _mm_sub_epi16(u4, v4); - x5 = _mm_sub_epi16(u5, v5); - x6 = _mm_sub_epi16(u6, v6); - x7 = _mm_sub_epi16(u7, v7); - - _mm_storel_epi64((__m128i *)store_diff, x0); - store_diff = (int64_t *)(diff + 1 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x1); - store_diff = (int64_t *)(diff + 2 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x2); - store_diff = (int64_t *)(diff + 3 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x3); - store_diff = (int64_t *)(diff + 4 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x4); - store_diff = (int64_t *)(diff + 5 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x5); - store_diff = (int64_t *)(diff + 6 * diff_stride); - _mm_storel_epi64((__m128i *)store_diff, x6); - store_diff = (int64_t *)(diff + 7 * diff_stride); - _mm_storel_epi64((__m128i 
*)store_diff, x7); -} - -static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - __m128i u0, u1, u2, u3; - __m128i v0, v1, v2, v3; - __m128i x0, x1, x2, x3; - - u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - - v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); - - x0 = _mm_sub_epi16(u0, v0); - x1 = _mm_sub_epi16(u1, v1); - x2 = _mm_sub_epi16(u2, v2); - x3 = _mm_sub_epi16(u3, v3); - - _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); - _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); - _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); - _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); -} - -static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - - u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); - u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); - - v0 = 
_mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); - v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); - - x0 = _mm_sub_epi16(u0, v0); - x1 = _mm_sub_epi16(u1, v1); - x2 = _mm_sub_epi16(u2, v2); - x3 = _mm_sub_epi16(u3, v3); - x4 = _mm_sub_epi16(u4, v4); - x5 = _mm_sub_epi16(u5, v5); - x6 = _mm_sub_epi16(u6, v6); - x7 = _mm_sub_epi16(u7, v7); - - _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); - _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); - _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); - _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); - _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); - _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); - _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); - _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); -} - -#define STACK_V(h, fun) \ - do { \ - fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ - fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ - pred + pred_stride * h, pred_stride); \ - } while (0) - -#define STACK_H(w, fun) \ - do { \ - fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ - fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ - } while (0) - -#define SUBTRACT_FUN(size) \ - static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ - const uint16_t *src, ptrdiff_t src_stride, \ - const uint16_t *pred, ptrdiff_t pred_stride) - -SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } -SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } 
-SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } -SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } -SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } -SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } -SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } -SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } -SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } -SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } -SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } -SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } -SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } -SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } -SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } -SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } -SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } -SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } - -static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { - if (rows == 4) { - if (cols == 4) return subtract_4x4; - if (cols == 8) return subtract_8x4; - if (cols == 16) return subtract_16x4; - } - if (rows == 8) { - if (cols == 4) return subtract_4x8; - if (cols == 8) return subtract_8x8; - if (cols == 16) return subtract_16x8; - if (cols == 32) return subtract_32x8; - } - if (rows == 16) { - if (cols == 4) return subtract_4x16; - if (cols == 8) return subtract_8x16; - if (cols == 16) return subtract_16x16; - if (cols == 32) return subtract_32x16; - if (cols == 64) return subtract_64x16; - } - if (rows == 32) { - if (cols == 8) return subtract_8x32; - if (cols == 16) return subtract_16x32; - if (cols == 32) return subtract_32x32; - if (cols == 64) return subtract_64x32; - } - if (rows == 64) { - if (cols == 16) return subtract_16x64; - if (cols == 32) return subtract_32x64; - if (cols == 64) return subtract_64x64; - if (cols == 128) return subtract_128x64; - } - if (rows == 128) { - if (cols == 64) return subtract_64x128; - if (cols == 128) return subtract_128x128; - } - assert(0); - return NULL; -} - -void 
aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src8, - ptrdiff_t src_stride, const uint8_t *pred8, - ptrdiff_t pred_stride, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - SubtractWxHFuncType func; - (void)bd; - - func = getSubtractFunc(rows, cols); - func(diff, diff_stride, src, src_stride, pred, pred_stride); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c deleted file mode 100644 index 9b1b4c9de..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <immintrin.h> // AVX2 - -#include "config/aom_dsp_rtcd.h" - -typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { - __m256i v_sum_d = _mm256_setzero_si256(); - __m256i v_sse_d = _mm256_setzero_si256(); - for (int i = 0; i < 8; i += 2) { - const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src); - const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); - const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref); - const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride)); - __m256i v_p_a = _mm256_castsi128_si256(v_p_a0); - __m256i v_p_b = _mm256_castsi128_si256(v_p_b0); - v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1); - v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1); - const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); - const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); - v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); - v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); - src += src_stride * 2; - ref += ref_stride * 2; - } - __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d)); - __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1)); - __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01); - __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); - __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); - __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); - const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); - const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); - __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); - v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); - *sum = _mm_extract_epi32(v_d, 0); - *sse = _mm_extract_epi32(v_d, 1); -} - -void aom_highbd_calc16x16var_avx2(const uint16_t *src, int 
src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { - __m256i v_sum_d = _mm256_setzero_si256(); - __m256i v_sse_d = _mm256_setzero_si256(); - const __m256i one = _mm256_set1_epi16(1); - for (int i = 0; i < 16; ++i) { - const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src); - const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref); - const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); - const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); - v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); - v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); - src += src_stride; - ref += ref_stride; - } - __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one); - __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); - __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); - __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); - const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); - const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); - __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); - v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); - *sum = _mm_extract_epi32(v_d, 0); - *sse = _mm_extract_epi32(v_d, 1); -} - -static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, int w, - int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int32_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 2); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); -} - -#define VAR_FN(w, h, block_size, shift) \ - uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t 
var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_avx2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } - -VAR_FN(128, 128, 16, 14); -VAR_FN(128, 64, 16, 13); -VAR_FN(64, 128, 16, 13); -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); -VAR_FN(16, 4, 16, 6); -VAR_FN(8, 32, 8, 8); -VAR_FN(32, 8, 8, 8); -VAR_FN(16, 64, 16, 10); -VAR_FN(64, 16, 16, 10); - -#undef VAR_FN diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm deleted file mode 100644 index 0d954e178..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm +++ /dev/null @@ -1,318 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - - -%include "aom_ports/x86_abi_support.asm" - -SECTION .text - -;unsigned int aom_highbd_calc16x16var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(aom_highbd_calc16x16var_sse2) PRIVATE -sym(aom_highbd_calc16x16var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - add rax, rax ; source stride in bytes - add rdx, rdx ; recon stride in bytes - - ; Prefetch data - prefetcht0 [rsi] - prefetcht0 [rsi+16] - prefetcht0 [rsi+rax] - prefetcht0 [rsi+rax+16] - lea rbx, [rsi+rax*2] - prefetcht0 [rbx] - prefetcht0 [rbx+16] - prefetcht0 [rbx+rax] - prefetcht0 [rbx+rax+16] - - prefetcht0 [rdi] - prefetcht0 [rdi+16] - prefetcht0 [rdi+rdx] - prefetcht0 [rdi+rdx+16] - lea rbx, [rdi+rdx*2] - prefetcht0 [rbx] - prefetcht0 [rbx+16] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx+16] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -.var16loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - lea rbx, [rsi+rax*2] - prefetcht0 [rbx] - prefetcht0 [rbx+16] - prefetcht0 [rbx+rax] - prefetcht0 [rbx+rax+16] - lea rbx, [rdi+rdx*2] - prefetcht0 [rbx] - prefetcht0 [rbx+16] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx+16] - - pxor xmm5, xmm5 - - psubw xmm1, xmm2 - movdqu xmm3, XMMWORD PTR [rsi+16] - paddw xmm5, xmm1 - pmaddwd xmm1, xmm1 - movdqu xmm2, XMMWORD PTR [rdi+16] - paddd xmm6, xmm1 - - psubw xmm3, xmm2 - movdqu xmm1, XMMWORD PTR [rsi+rax] - paddw xmm5, xmm3 - pmaddwd xmm3, xmm3 - movdqu xmm2, XMMWORD PTR [rdi+rdx] - paddd xmm6, xmm3 - - psubw xmm1, xmm2 - movdqu xmm3, XMMWORD PTR [rsi+rax+16] - paddw xmm5, 
xmm1 - pmaddwd xmm1, xmm1 - movdqu xmm2, XMMWORD PTR [rdi+rdx+16] - paddd xmm6, xmm1 - - psubw xmm3, xmm2 - paddw xmm5, xmm3 - pmaddwd xmm3, xmm3 - paddd xmm6, xmm3 - - movdqa xmm1, xmm5 - movdqa xmm2, xmm5 - pcmpgtw xmm1, xmm0 - pcmpeqw xmm2, xmm0 - por xmm1, xmm2 - pcmpeqw xmm1, xmm0 - movdqa xmm2, xmm5 - punpcklwd xmm5, xmm1 - punpckhwd xmm2, xmm1 - paddd xmm7, xmm5 - paddd xmm7, xmm2 - - lea rsi, [rsi + 2*rax] - lea rdi, [rdi + 2*rdx] - sub rcx, 2 - jnz .var16loop - - movdqa xmm4, xmm6 - punpckldq xmm6, xmm0 - - punpckhdq xmm4, xmm0 - movdqa xmm5, xmm7 - - paddd xmm6, xmm4 - punpckldq xmm7, xmm0 - - punpckhdq xmm5, xmm0 - paddd xmm7, xmm5 - - movdqa xmm4, xmm6 - movdqa xmm5, xmm7 - - psrldq xmm4, 8 - psrldq xmm5, 8 - - paddd xmm6, xmm4 - paddd xmm7, xmm5 - - mov rdi, arg(4) ; [SSE] - mov rax, arg(5) ; [Sum] - - movd DWORD PTR [rdi], xmm6 - movd DWORD PTR [rax], xmm7 - - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int aom_highbd_calc8x8var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(aom_highbd_calc8x8var_sse2) PRIVATE -sym(aom_highbd_calc8x8var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - add rax, rax ; source stride in bytes - add rdx, rdx ; recon stride in bytes - - ; Prefetch data - prefetcht0 [rsi] - prefetcht0 [rsi+rax] - lea rbx, [rsi+rax*2] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - - prefetcht0 [rdi] - prefetcht0 [rdi+rdx] - lea rbx, [rdi+rdx*2] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 8 
- -.var8loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - lea rbx, [rsi+rax*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - lea rbx, [rbx+rax*2] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - lea rbx, [rdi+rdx*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - lea rbx, [rbx+rdx*2] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - - pxor xmm5, xmm5 - - psubw xmm1, xmm2 - movdqu xmm3, XMMWORD PTR [rsi+rax] - paddw xmm5, xmm1 - pmaddwd xmm1, xmm1 - movdqu xmm2, XMMWORD PTR [rdi+rdx] - paddd xmm6, xmm1 - - lea rsi, [rsi + 2*rax] - lea rdi, [rdi + 2*rdx] - - psubw xmm3, xmm2 - movdqu xmm1, XMMWORD PTR [rsi] - paddw xmm5, xmm3 - pmaddwd xmm3, xmm3 - movdqu xmm2, XMMWORD PTR [rdi] - paddd xmm6, xmm3 - - psubw xmm1, xmm2 - movdqu xmm3, XMMWORD PTR [rsi+rax] - paddw xmm5, xmm1 - pmaddwd xmm1, xmm1 - movdqu xmm2, XMMWORD PTR [rdi+rdx] - paddd xmm6, xmm1 - - psubw xmm3, xmm2 - paddw xmm5, xmm3 - pmaddwd xmm3, xmm3 - paddd xmm6, xmm3 - - movdqa xmm1, xmm5 - movdqa xmm2, xmm5 - pcmpgtw xmm1, xmm0 - pcmpeqw xmm2, xmm0 - por xmm1, xmm2 - pcmpeqw xmm1, xmm0 - movdqa xmm2, xmm5 - punpcklwd xmm5, xmm1 - punpckhwd xmm2, xmm1 - paddd xmm7, xmm5 - paddd xmm7, xmm2 - - lea rsi, [rsi + 2*rax] - lea rdi, [rdi + 2*rdx] - sub rcx, 4 - jnz .var8loop - - movdqa xmm4, xmm6 - punpckldq xmm6, xmm0 - - punpckhdq xmm4, xmm0 - movdqa xmm5, xmm7 - - paddd xmm6, xmm4 - punpckldq xmm7, xmm0 - - punpckhdq xmm5, xmm0 - paddd xmm7, xmm5 - - movdqa xmm4, xmm6 - movdqa xmm5, xmm7 - - psrldq xmm4, 8 - psrldq xmm5, 8 - - paddd xmm6, xmm4 - paddd xmm7, xmm5 - - mov rdi, arg(4) ; [SSE] - mov rax, arg(5) ; [Sum] - - movd DWORD PTR [rdi], xmm6 - movd DWORD PTR [rax], xmm7 - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c deleted file mode 100644 index 47b052abc..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ /dev/null 
@@ -1,868 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" -#include "config/av1_rtcd.h" - -#include "aom_dsp/x86/synonyms.h" - -#include "aom_ports/mem.h" - -#include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" -#include "av1/common/reconinter.h" - -typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, int w, - int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - -static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, int w, - int h, uint32_t *sse, int *sum, - high_variance_fn_t 
var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int32_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 2); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); -} - -static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, int w, - int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int32_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 4); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); -} - -#define HIGH_GET_VAR(S) \ - void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ - sum); \ - } \ - \ - void aom_highbd_10_get##S##x##S##var_sse2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ - sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 2); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ - } \ - \ - void aom_highbd_12_get##S##x##S##var_sse2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int 
ref_stride, uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ - sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 4); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ - } - -HIGH_GET_VAR(16); -HIGH_GET_VAR(8); - -#undef HIGH_GET_VAR - -#define VAR_FN(w, h, block_size, shift) \ - uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_8_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ - } \ - \ - uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_12_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } - -VAR_FN(128, 128, 16, 14); -VAR_FN(128, 64, 16, 13); -VAR_FN(64, 128, 16, 13); -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); -VAR_FN(16, 4, 16, 6); -VAR_FN(8, 32, 8, 8); -VAR_FN(32, 8, 8, 8); -VAR_FN(16, 64, 16, 10); -VAR_FN(64, 16, 16, 10); - -#undef VAR_FN - -unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, - aom_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, - aom_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, - aom_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, - aom_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int 
aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, - aom_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, - aom_highbd_calc8x8var_sse2, 8); - return *sse; -} - -// The 2 unused parameters are place holders for PIC enabled build. -// These definitions are for functions defined in -// highbd_subpel_variance_impl_sse2.asm -#define DECL(w, opt) \ - int aom_highbd_sub_pixel_variance##w##xh_##opt( \ - const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, int height, \ - unsigned int *sse, void *unused0, void *unused); -#define DECLS(opt) \ - DECL(8, opt); \ - DECL(16, opt) - -DECLS(sse2); - -#undef DECLS -#undef DECL - -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ - uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ 
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - int64_t var; \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - \ - uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - int start_row; \ - uint32_t sse; \ - int se = 0; \ - int64_t var; \ - uint64_t long_sse = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - for (start_row = 0; start_row < h; start_row += 16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ - NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ - &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - } \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } - -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); \ - FN(16, 4, 16, 4, 2, opt, (int64_t)); \ - FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t)); \ - FN(64, 16, 16, 6, 4, opt, (int64_t)) - -FNS(sse2); - -#undef FNS -#undef FN - -// The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ - int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \ - const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused); -#define DECLS(opt) \ - DECL(16, opt) \ - DECL(8, opt) - -DECLS(sse2); -#undef DECL -#undef DECLS - -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ - uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, 
&sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - int64_t var; \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - \ - uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - int start_row; \ - int64_t var; \ - uint32_t sse; \ - int se = 0; \ - uint64_t long_sse = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - for (start_row = 0; start_row < h; start_row += 16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ - w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - } \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ - return 
(var >= 0) ? (uint32_t)var : 0; \ - } - -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); \ - FN(16, 4, 16, 4, 2, opt, (int64_t)); \ - FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t)); \ - FN(64, 16, 16, 6, 4, opt, (int64_t)); - -FNS(sse2); - -#undef FNS -#undef FN - -void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, - const struct AV1Common *const cm, - int mi_row, int mi_col, const MV *const mv, - uint8_t *comp_pred8, int width, int height, - int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref8, int ref_stride, int bd, - int subpel_search) { - // expect xd == NULL only in tests - if (xd != NULL) { - const MB_MODE_INFO *mi = xd->mi[0]; - const int ref_num = 0; - const int is_intrabc = is_intrabc_block(mi); - const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; - const int is_scaled = av1_is_scaled(sf); - - if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); - const struct buf_2d *const dst_buf = &pd->dst; - const struct buf_2d *const pre_buf = - is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - - // Calculate subpel_x/y and x/y_step. 
- const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. - const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. - const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = - av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. 
- const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); - return; - } - } - - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - - if (!subpel_x_q3 && !subpel_y_q3) { - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - if (width >= 8) { - int i; - assert(!(width & 7)); - /*Read 8 pixels one row at a time.*/ - for (i = 0; i < height; i++) { - int j; - for (j = 0; j < width; j += 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - _mm_storeu_si128((__m128i *)comp_pred, s0); - comp_pred += 8; - ref += 8; - } - ref += ref_stride - width; - } - } else { - int i; - assert(!(width & 3)); - /*Read 4 pixels two rows at a time.*/ - for (i = 0; i < height; i += 2) { - __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); - __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); - __m128i t0 = _mm_unpacklo_epi64(s0, s1); - _mm_storeu_si128((__m128i *)comp_pred, t0); - comp_pred += 8; - ref += 2 * ref_stride; - } - } - } else if (!subpel_y_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, - NULL, -1, width, height, bd); - } else if (!subpel_x_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, - kernel, 16, width, height, bd); - } else { - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); - const int16_t *const kernel_x = - 
av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - const int16_t *const kernel_y = - av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height, bd); - aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, - bd); - } -} - -void aom_highbd_comp_avg_upsampled_pred_sse2( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, int subpel_search) { - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, - height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd, subpel_search); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); - /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ - assert(!(width * height & 7)); - int n = width * height >> 3; - for (int i = 0; i < n; i++) { - __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); - comp_pred16 += 8; - pred += 8; - } -} - -static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, - const __m128i *w0, - const __m128i *w1, - const __m128i *r, - void *const result) { - assert(DIST_PRECISION_BITS <= 4); - __m128i mult0 = _mm_mullo_epi16(*p0, *w0); - __m128i mult1 = _mm_mullo_epi16(*p1, *w1); - __m128i sum = _mm_adds_epu16(mult0, mult1); - 
__m128i round = _mm_adds_epu16(sum, *r); - __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); - - xx_storeu_128(result, shift); -} - -void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, - const uint8_t *pred8, int width, - int height, const uint8_t *ref8, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { - int i; - const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; - const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; - const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); - const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); - const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); - const __m128i r = - _mm_set_epi16(round, round, round, round, round, round, round, round); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - - if (width >= 8) { - // Read 8 pixels one row at a time - assert(!(width & 7)); - for (i = 0; i < height; ++i) { - int j; - for (j = 0; j < width; j += 8) { - __m128i p0 = xx_loadu_128(ref); - __m128i p1 = xx_loadu_128(pred); - - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); - - comp_pred += 8; - pred += 8; - ref += 8; - } - ref += ref_stride - width; - } - } else { - // Read 4 pixels two rows at a time - assert(!(width & 3)); - for (i = 0; i < height; i += 2) { - __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); - __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); - __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); - __m128i p1 = xx_loadu_128(pred); - - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); - - comp_pred += 8; - pred += 8; - ref += 2 * ref_stride; - } - } -} - -void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int 
ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, - int subpel_search) { - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - int n; - int i; - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, - height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd, subpel_search); - assert(!(width * height & 7)); - n = width * height >> 3; - - const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; - const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; - const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); - const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); - const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); - const __m128i r = - _mm_set_epi16(round, round, round, round, round, round, round, round); - - uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); - for (i = 0; i < n; i++) { - __m128i p0 = xx_loadu_128(comp_pred16); - __m128i p1 = xx_loadu_128(pred); - - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); - - comp_pred16 += 8; - pred += 8; - } -} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c deleted file mode 100644 index df5449a9d..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <smmintrin.h> /* SSE4.1 */ - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/variance.h" -#include "aom_dsp/aom_filter.h" - -static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - uint64_t *sse, int64_t *sum) { - __m128i u0, u1, u2, u3; - __m128i s0, s1, s2, s3; - __m128i t0, t1, x0, y0; - __m128i a0, a1, a2, a3; - __m128i b0, b1, b2, b3; - __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); - - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - - a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); - a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); - a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); - a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); - - b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); - b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); - b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); - b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); - - u0 = _mm_unpacklo_epi16(a0, a1); - u1 = _mm_unpacklo_epi16(a2, a3); - u2 = _mm_unpacklo_epi16(b0, b1); - u3 = _mm_unpacklo_epi16(b2, b3); - - s0 = _mm_sub_epi16(u0, u2); - s1 = _mm_sub_epi16(u1, u3); - - t0 = _mm_madd_epi16(s0, k_one_epi16); - t1 = _mm_madd_epi16(s1, k_one_epi16); - - s2 = _mm_hadd_epi32(t0, t1); - s3 = _mm_hadd_epi32(s2, s2); - y0 = _mm_hadd_epi32(s3, s3); - - t0 = _mm_madd_epi16(s0, s0); - t1 = _mm_madd_epi16(s1, s1); - - s2 = _mm_hadd_epi32(t0, t1); - s3 = _mm_hadd_epi32(s2, s2); - x0 = _mm_hadd_epi32(s3, s3); - - *sse = (uint64_t)_mm_extract_epi32(x0, 0); - *sum = (int64_t)_mm_extract_epi32(y0, 0); -} - -uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int64_t sum, diff; - uint64_t local_sse; - - variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); - *sse = (uint32_t)local_sse; - - diff = 
(int64_t)*sse - ((sum * sum) >> 4); - return (diff >= 0) ? (uint32_t)diff : 0; -} - -uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int64_t sum, diff; - uint64_t local_sse; - - variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); - *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); - sum = ROUND_POWER_OF_TWO(sum, 2); - - diff = (int64_t)*sse - ((sum * sum) >> 4); - return (diff >= 0) ? (uint32_t)diff : 0; -} - -uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int64_t sum, diff; - uint64_t local_sse; - - variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); - *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); - sum = ROUND_POWER_OF_TWO(sum, 4); - - diff = (int64_t)*sse - ((sum * sum) >> 4); - return diff >= 0 ? (uint32_t)diff : 0; -} - -// Sub-pixel -uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, uint32_t *sse) { - uint16_t fdata3[(4 + 1) * 4]; - uint16_t temp2[4 * 4]; - - aom_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); - aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, - sse); -} - -uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, uint32_t *sse) { - uint16_t fdata3[(4 + 1) * 4]; - uint16_t temp2[4 * 4]; - - aom_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); - aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return 
aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, - dst_stride, sse); -} - -uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, uint32_t *sse) { - uint16_t fdata3[(4 + 1) * 4]; - uint16_t temp2[4 * 4]; - - aom_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); - aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, - dst_stride, sse); -} - -// Sub-pixel average - -uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, uint32_t *sse, - const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; - uint16_t temp2[4 * 4]; - DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); - - aom_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); - aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); - - return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, - sse); -} - -uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, uint32_t *sse, - const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; - uint16_t temp2[4 * 4]; - DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); - - aom_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); - aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - 
aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); - - return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, - dst_stride, sse); -} - -uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, uint32_t *sse, - const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; - uint16_t temp2[4 * 4]; - DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); - - aom_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); - aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); - - return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, - dst_stride, sse); -} diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c deleted file mode 100644 index 1e67d392e..000000000 --- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c +++ /dev/null @@ -1,811 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -static INLINE __m256i dc_sum_64(const uint8_t *ref) { - const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); - const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); - const __m256i zero = _mm256_setzero_si256(); - __m256i y0 = _mm256_sad_epu8(x0, zero); - __m256i y1 = _mm256_sad_epu8(x1, zero); - y0 = _mm256_add_epi64(y0, y1); - __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); - y0 = _mm256_add_epi64(u0, y0); - u0 = _mm256_unpackhi_epi64(y0, y0); - return _mm256_add_epi16(y0, u0); -} - -static INLINE __m256i dc_sum_32(const uint8_t *ref) { - const __m256i x = _mm256_loadu_si256((const __m256i *)ref); - const __m256i zero = _mm256_setzero_si256(); - __m256i y = _mm256_sad_epu8(x, zero); - __m256i u = _mm256_permute2x128_si256(y, y, 1); - y = _mm256_add_epi64(u, y); - u = _mm256_unpackhi_epi64(y, y); - return _mm256_add_epi16(y, u); -} - -static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, - ptrdiff_t stride) { - for (int i = 0; i < height; ++i) { - _mm256_storeu_si256((__m256i *)dst, *r); - dst += stride; - } -} - -static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, - int height, uint8_t *dst, - ptrdiff_t stride) { - for (int i = 0; i < height; ++i) { - _mm256_storeu_si256((__m256i *)dst, *r0); - _mm256_storeu_si256((__m256i *)(dst + 32), *r1); - dst += stride; - } -} - -static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, - ptrdiff_t stride) { - for (int i = 0; i < height; ++i) { - _mm256_storeu_si256((__m256i *)dst, *r); - _mm256_storeu_si256((__m256i *)(dst + 32), *r); - dst += stride; - } -} - -void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i sum_above = dc_sum_32(above); - __m256i sum_left = dc_sum_32(left); - sum_left = _mm256_add_epi16(sum_left, sum_above); - const __m256i thirtytwo = _mm256_set1_epi16(32); - sum_left 
= _mm256_add_epi16(sum_left, thirtytwo); - sum_left = _mm256_srai_epi16(sum_left, 6); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum_left, zero); - row_store_32xh(&row, 32, dst, stride); -} - -void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_32(above); - (void)left; - - const __m256i sixteen = _mm256_set1_epi16(16); - sum = _mm256_add_epi16(sum, sixteen); - sum = _mm256_srai_epi16(sum, 5); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_32xh(&row, 32, dst, stride); -} - -void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_32(left); - (void)above; - - const __m256i sixteen = _mm256_set1_epi16(16); - sum = _mm256_add_epi16(sum, sixteen); - sum = _mm256_srai_epi16(sum, 5); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_32xh(&row, 32, dst, stride); -} - -void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m256i row = _mm256_set1_epi8((uint8_t)0x80); - row_store_32xh(&row, 32, dst, stride); -} - -void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i row = _mm256_loadu_si256((const __m256i *)above); - (void)left; - row_store_32xh(&row, 32, dst, stride); -} - -// There are 32 rows togeter. This function does line: -// 0,1,2,3, and 16,17,18,19. The next call would do -// 4,5,6,7, and 20,21,22,23. So 4 times of calling -// would finish 32 rows. 
-static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, - ptrdiff_t stride) { - __m256i t[4]; - __m256i m = _mm256_setzero_si256(); - const __m256i inc = _mm256_set1_epi8(4); - int i; - - for (i = 0; i < 4; i++) { - t[i] = _mm256_shuffle_epi8(*row, m); - __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); - __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); - _mm256_storeu_si256((__m256i *)dst, r0); - _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); - dst += stride; - m = _mm256_add_epi8(m, inc); - } -} - -void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); - - __m256i u = _mm256_unpacklo_epi8(left_col, left_col); - - __m256i v = _mm256_unpacklo_epi8(u, u); - h_predictor_32x8line(&v, dst, stride); - dst += stride << 2; - - v = _mm256_unpackhi_epi8(u, u); - h_predictor_32x8line(&v, dst, stride); - dst += stride << 2; - - u = _mm256_unpackhi_epi8(left_col, left_col); - - v = _mm256_unpacklo_epi8(u, u); - h_predictor_32x8line(&v, dst, stride); - dst += stride << 2; - - v = _mm256_unpackhi_epi8(u, u); - h_predictor_32x8line(&v, dst, stride); -} - -// ----------------------------------------------------------------------------- -// Rectangle - -// TODO(luoyi) The following two functions are shared with intrapred_sse2.c. 
-// Use a header file, intrapred_common_x86.h -static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { - __m128i x = _mm_load_si128((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - x = _mm_sad_epu8(x, zero); - const __m128i high = _mm_unpackhi_epi64(x, x); - return _mm_add_epi16(x, high); -} - -static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { - __m128i x0 = _mm_load_si128((__m128i const *)ref); - __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); - const __m128i zero = _mm_setzero_si128(); - x0 = _mm_sad_epu8(x0, zero); - x1 = _mm_sad_epu8(x1, zero); - x0 = _mm_add_epi16(x0, x1); - const __m128i high = _mm_unpackhi_epi64(x0, x0); - return _mm_add_epi16(x0, high); -} - -void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i top_sum = dc_sum_32_sse2(above); - __m128i left_sum = dc_sum_16_sse2(left); - left_sum = _mm_add_epi16(top_sum, left_sum); - uint32_t sum = _mm_cvtsi128_si32(left_sum); - sum += 24; - sum /= 48; - const __m256i row = _mm256_set1_epi8((uint8_t)sum); - row_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i sum_above = dc_sum_32(above); - __m256i sum_left = dc_sum_64(left); - sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); - sum += 48; - sum /= 96; - const __m256i row = _mm256_set1_epi8((uint8_t)sum); - row_store_32xh(&row, 64, dst, stride); -} - -void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i sum_above = dc_sum_64(above); - __m256i sum_left = dc_sum_64(left); - sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); - sum += 64; - sum /= 128; - const __m256i row = _mm256_set1_epi8((uint8_t)sum); - 
row_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i sum_above = dc_sum_64(above); - __m256i sum_left = dc_sum_32(left); - sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); - sum += 48; - sum /= 96; - const __m256i row = _mm256_set1_epi8((uint8_t)sum); - row_store_64xh(&row, 32, dst, stride); -} - -void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i sum_above = dc_sum_64(above); - __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); - sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); - sum += 40; - sum /= 80; - const __m256i row = _mm256_set1_epi8((uint8_t)sum); - row_store_64xh(&row, 16, dst, stride); -} - -void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_32(above); - (void)left; - - const __m256i sixteen = _mm256_set1_epi16(16); - sum = _mm256_add_epi16(sum, sixteen); - sum = _mm256_srai_epi16(sum, 5); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_32(above); - (void)left; - - const __m256i sixteen = _mm256_set1_epi16(16); - sum = _mm256_add_epi16(sum, sixteen); - sum = _mm256_srai_epi16(sum, 5); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_32xh(&row, 64, dst, stride); -} - -void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_64(above); - (void)left; - 
- const __m256i thirtytwo = _mm256_set1_epi16(32); - sum = _mm256_add_epi16(sum, thirtytwo); - sum = _mm256_srai_epi16(sum, 6); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_64(above); - (void)left; - - const __m256i thirtytwo = _mm256_set1_epi16(32); - sum = _mm256_add_epi16(sum, thirtytwo); - sum = _mm256_srai_epi16(sum, 6); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_64xh(&row, 32, dst, stride); -} - -void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_64(above); - (void)left; - - const __m256i thirtytwo = _mm256_set1_epi16(32); - sum = _mm256_add_epi16(sum, thirtytwo); - sum = _mm256_srai_epi16(sum, 6); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_64xh(&row, 16, dst, stride); -} - -void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i sum = dc_sum_16_sse2(left); - (void)above; - - const __m128i eight = _mm_set1_epi16(8); - sum = _mm_add_epi16(sum, eight); - sum = _mm_srai_epi16(sum, 4); - const __m128i zero = _mm_setzero_si128(); - const __m128i r = _mm_shuffle_epi8(sum, zero); - const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); - row_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_64(left); - (void)above; - - const __m256i thirtytwo = _mm256_set1_epi16(32); - sum = _mm256_add_epi16(sum, thirtytwo); - sum = _mm256_srai_epi16(sum, 6); - const __m256i zero = _mm256_setzero_si256(); - 
__m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_32xh(&row, 64, dst, stride); -} - -void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_64(left); - (void)above; - - const __m256i thirtytwo = _mm256_set1_epi16(32); - sum = _mm256_add_epi16(sum, thirtytwo); - sum = _mm256_srai_epi16(sum, 6); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m256i sum = dc_sum_32(left); - (void)above; - - const __m256i sixteen = _mm256_set1_epi16(16); - sum = _mm256_add_epi16(sum, sixteen); - sum = _mm256_srai_epi16(sum, 5); - const __m256i zero = _mm256_setzero_si256(); - __m256i row = _mm256_shuffle_epi8(sum, zero); - row_store_64xh(&row, 32, dst, stride); -} - -void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i sum = dc_sum_16_sse2(left); - (void)above; - - const __m128i eight = _mm_set1_epi16(8); - sum = _mm_add_epi16(sum, eight); - sum = _mm_srai_epi16(sum, 4); - const __m128i zero = _mm_setzero_si128(); - const __m128i r = _mm_shuffle_epi8(sum, zero); - const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); - row_store_64xh(&row, 16, dst, stride); -} - -void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m256i row = _mm256_set1_epi8((uint8_t)0x80); - row_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m256i row = _mm256_set1_epi8((uint8_t)0x80); - row_store_32xh(&row, 64, dst, stride); -} - -void 
aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m256i row = _mm256_set1_epi8((uint8_t)0x80); - row_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m256i row = _mm256_set1_epi8((uint8_t)0x80); - row_store_64xh(&row, 32, dst, stride); -} - -void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m256i row = _mm256_set1_epi8((uint8_t)0x80); - row_store_64xh(&row, 16, dst, stride); -} - -void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i row = _mm256_loadu_si256((const __m256i *)above); - (void)left; - row_store_32xh(&row, 16, dst, stride); -} - -void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i row = _mm256_loadu_si256((const __m256i *)above); - (void)left; - row_store_32xh(&row, 64, dst, stride); -} - -void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); - const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); - (void)left; - row_store_32x2xh(&row0, &row1, 64, dst, stride); -} - -void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); - const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); - (void)left; - row_store_32x2xh(&row0, &row1, 32, dst, stride); -} - -void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i row0 = 
_mm256_loadu_si256((const __m256i *)above); - const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); - (void)left; - row_store_32x2xh(&row0, &row1, 16, dst, stride); -} - -// ----------------------------------------------------------------------------- -// PAETH_PRED - -// Return 16 16-bit pixels in one row (__m256i) -static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, - const __m256i *topleft) { - const __m256i base = - _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); - - __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); - __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); - __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); - - __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); - mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); - __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); - - pl = _mm256_andnot_si256(mask1, *left); - - ptl = _mm256_and_si256(mask2, *topleft); - pt = _mm256_andnot_si256(mask2, *top); - pt = _mm256_or_si256(pt, ptl); - pt = _mm256_and_si256(mask1, pt); - - return _mm256_or_si256(pt, pl); -} - -// Return 16 8-bit pixels in one row (__m128i) -static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, - const __m256i *topleft) { - const __m256i p0 = paeth_pred(left, top, topleft); - const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); - const __m256i p = _mm256_packus_epi16(p0, p1); - return _mm256_castsi256_si128(p); -} - -static INLINE __m256i get_top_vector(const uint8_t *above) { - const __m128i x = _mm_load_si128((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i t0 = _mm_unpacklo_epi8(x, zero); - const __m128i t1 = _mm_unpackhi_epi8(x, zero); - return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); -} - -void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i x = _mm_loadl_epi64((const __m128i *)left); - const __m256i l = 
_mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); - const __m256i one = _mm256_set1_epi16(1); - const __m256i top = get_top_vector(above); - - int i; - for (i = 0; i < 8; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm256_add_epi16(rep, one); - } -} - -static INLINE __m256i get_left_vector(const uint8_t *left) { - const __m128i x = _mm_load_si128((const __m128i *)left); - return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); -} - -void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i l = get_left_vector(left); - const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); - const __m256i one = _mm256_set1_epi16(1); - const __m256i top = get_top_vector(above); - - int i; - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm256_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m256i l = get_left_vector(left); - const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); - const __m256i one = _mm256_set1_epi16(1); - const __m256i top = get_top_vector(above); - - int i; - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm256_add_epi16(rep, one); - } - - l = get_left_vector(left + 16); - rep = _mm256_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) 
{ - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm256_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - const __m256i one = _mm256_set1_epi16(1); - const __m256i top = get_top_vector(above); - - for (int j = 0; j < 4; ++j) { - const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); - for (int i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm256_add_epi16(rep, one); - } - } -} - -// Return 32 8-bit pixels in one row (__m256i) -static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, - const __m256i *top1, - const __m256i *topleft) { - __m256i p0 = paeth_pred(left, top0, topleft); - __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); - const __m256i x0 = _mm256_packus_epi16(p0, p1); - - p0 = paeth_pred(left, top1, topleft); - p1 = _mm256_permute4x64_epi64(p0, 0xe); - const __m256i x1 = _mm256_packus_epi16(p0, p1); - - return _mm256_permute2x128_si256(x0, x1, 0x20); -} - -void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i l = get_left_vector(left); - const __m256i t0 = get_top_vector(above); - const __m256i t1 = get_top_vector(above + 16); - const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); - const __m256i one = _mm256_set1_epi16(1); - - int i; - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - - const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); - - _mm256_storeu_si256((__m256i *)dst, r); - - dst += stride; - 
rep = _mm256_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m256i l = get_left_vector(left); - const __m256i t0 = get_top_vector(above); - const __m256i t1 = get_top_vector(above + 16); - const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); - const __m256i one = _mm256_set1_epi16(1); - - int i; - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - - const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); - const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - - dst += stride; - rep = _mm256_add_epi16(rep, one); - } - - l = get_left_vector(left + 16); - rep = _mm256_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - - const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); - const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - - dst += stride; - rep = _mm256_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i t0 = get_top_vector(above); - const __m256i t1 = get_top_vector(above + 16); - const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - const __m256i one = _mm256_set1_epi16(1); - - int i, j; - for (j = 0; j < 4; ++j) { - const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - - const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); - const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - - dst += stride; - rep = _mm256_add_epi16(rep, one); - } - } -} - -void 
aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i t0 = get_top_vector(above); - const __m256i t1 = get_top_vector(above + 16); - const __m256i t2 = get_top_vector(above + 32); - const __m256i t3 = get_top_vector(above + 48); - const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - const __m256i one = _mm256_set1_epi16(1); - - int i, j; - for (j = 0; j < 2; ++j) { - const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - - const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); - const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); - const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); - const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - _mm_store_si128((__m128i *)(dst + 32), r2); - _mm_store_si128((__m128i *)(dst + 48), r3); - - dst += stride; - rep = _mm256_add_epi16(rep, one); - } - } -} - -void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i t0 = get_top_vector(above); - const __m256i t1 = get_top_vector(above + 16); - const __m256i t2 = get_top_vector(above + 32); - const __m256i t3 = get_top_vector(above + 48); - const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - const __m256i one = _mm256_set1_epi16(1); - - int i, j; - for (j = 0; j < 4; ++j) { - const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - - const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); - const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); - const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); - const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); - - _mm_store_si128((__m128i *)dst, r0); - 
_mm_store_si128((__m128i *)(dst + 16), r1); - _mm_store_si128((__m128i *)(dst + 32), r2); - _mm_store_si128((__m128i *)(dst + 48), r3); - - dst += stride; - rep = _mm256_add_epi16(rep, one); - } - } -} - -void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m256i t0 = get_top_vector(above); - const __m256i t1 = get_top_vector(above + 16); - const __m256i t2 = get_top_vector(above + 32); - const __m256i t3 = get_top_vector(above + 48); - const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - const __m256i one = _mm256_set1_epi16(1); - - int i; - const __m256i l = get_left_vector(left); - __m256i rep = _mm256_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - const __m256i l16 = _mm256_shuffle_epi8(l, rep); - - const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); - const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); - const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); - const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - _mm_store_si128((__m128i *)(dst + 32), r2); - _mm_store_si128((__m128i *)(dst + 48), r3); - - dst += stride; - rep = _mm256_add_epi16(rep, one); - } -} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c deleted file mode 100644 index 5b2452c8e..000000000 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c +++ /dev/null @@ -1,1430 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, - ptrdiff_t stride) { - for (int i = 0; i < height; i += 2) { - *(uint32_t *)dst = dc; - dst += stride; - *(uint32_t *)dst = dc; - dst += stride; - } -} - -static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, - ptrdiff_t stride) { - int i; - for (i = 0; i < height; ++i) { - _mm_storel_epi64((__m128i *)dst, *row); - dst += stride; - } -} - -static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, - ptrdiff_t stride) { - int i; - for (i = 0; i < height; ++i) { - _mm_store_si128((__m128i *)dst, *row); - dst += stride; - } -} - -static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, - ptrdiff_t stride) { - int i; - for (i = 0; i < height; ++i) { - _mm_store_si128((__m128i *)dst, *row); - _mm_store_si128((__m128i *)(dst + 16), *row); - dst += stride; - } -} - -static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, - ptrdiff_t stride) { - for (int i = 0; i < height; ++i) { - _mm_store_si128((__m128i *)dst, *row); - _mm_store_si128((__m128i *)(dst + 16), *row); - _mm_store_si128((__m128i *)(dst + 32), *row); - _mm_store_si128((__m128i *)(dst + 48), *row); - dst += stride; - } -} - -static INLINE __m128i dc_sum_4(const uint8_t *ref) { - __m128i x = _mm_loadl_epi64((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - x = _mm_unpacklo_epi8(x, zero); - return _mm_sad_epu8(x, zero); -} - -static INLINE __m128i dc_sum_8(const uint8_t *ref) { - __m128i x = _mm_loadl_epi64((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - return _mm_sad_epu8(x, zero); -} - -static INLINE __m128i dc_sum_16(const uint8_t *ref) { - __m128i x = 
_mm_load_si128((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - x = _mm_sad_epu8(x, zero); - const __m128i high = _mm_unpackhi_epi64(x, x); - return _mm_add_epi16(x, high); -} - -static INLINE __m128i dc_sum_32(const uint8_t *ref) { - __m128i x0 = _mm_load_si128((__m128i const *)ref); - __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); - const __m128i zero = _mm_setzero_si128(); - x0 = _mm_sad_epu8(x0, zero); - x1 = _mm_sad_epu8(x1, zero); - x0 = _mm_add_epi16(x0, x1); - const __m128i high = _mm_unpackhi_epi64(x0, x0); - return _mm_add_epi16(x0, high); -} - -static INLINE __m128i dc_sum_64(const uint8_t *ref) { - __m128i x0 = _mm_load_si128((__m128i const *)ref); - __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); - __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); - __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); - const __m128i zero = _mm_setzero_si128(); - x0 = _mm_sad_epu8(x0, zero); - x1 = _mm_sad_epu8(x1, zero); - x2 = _mm_sad_epu8(x2, zero); - x3 = _mm_sad_epu8(x3, zero); - x0 = _mm_add_epi16(x0, x1); - x2 = _mm_add_epi16(x2, x3); - x0 = _mm_add_epi16(x0, x2); - const __m128i high = _mm_unpackhi_epi64(x0, x0); - return _mm_add_epi16(x0, high); -} - -#define DC_MULTIPLIER_1X2 0x5556 -#define DC_MULTIPLIER_1X4 0x3334 - -#define DC_SHIFT2 16 - -static INLINE int divide_using_multiply_shift(int num, int shift1, - int multiplier) { - const int interm = num >> shift1; - return interm * multiplier >> DC_SHIFT2; -} - -// ----------------------------------------------------------------------------- -// DC_PRED - -void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_8(left); - __m128i sum_above = dc_sum_4(above); - sum_above = _mm_add_epi16(sum_left, sum_above); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 6; - sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); - - const __m128i row = 
_mm_set1_epi8((uint8_t)sum); - const uint32_t pred = _mm_cvtsi128_si32(row); - dc_store_4xh(pred, 8, dst, stride); -} - -void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_16(left); - __m128i sum_above = dc_sum_4(above); - sum_above = _mm_add_epi16(sum_left, sum_above); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 10; - sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); - - const __m128i row = _mm_set1_epi8((uint8_t)sum); - const uint32_t pred = _mm_cvtsi128_si32(row); - dc_store_4xh(pred, 16, dst, stride); -} - -void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_4(left); - __m128i sum_above = dc_sum_8(above); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 6; - sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); - - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_8xh(&row, 4, dst, stride); -} - -void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_16(left); - __m128i sum_above = dc_sum_8(above); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 12; - sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_8xh(&row, 16, dst, stride); -} - -void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_32(left); - __m128i sum_above = dc_sum_8(above); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 20; - sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - 
dc_store_8xh(&row, 32, dst, stride); -} - -void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_4(left); - __m128i sum_above = dc_sum_16(above); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 10; - sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_16xh(&row, 4, dst, stride); -} - -void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_8(left); - __m128i sum_above = dc_sum_16(above); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 12; - sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_16xh(&row, 8, dst, stride); -} - -void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_32(left); - __m128i sum_above = dc_sum_16(above); - sum_above = _mm_add_epi16(sum_left, sum_above); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 24; - sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_16xh(&row, 32, dst, stride); -} - -void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_64(left); - __m128i sum_above = dc_sum_16(above); - sum_above = _mm_add_epi16(sum_left, sum_above); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 40; - sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_16xh(&row, 64, dst, stride); -} - -void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t 
*above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); - const __m128i sum_left = dc_sum_8(left); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 20; - sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_32xh(&row, 8, dst, stride); -} - -void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); - const __m128i sum_left = dc_sum_16(left); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 24; - sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); - const __m128i sum_left = dc_sum_64(left); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 48; - sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_32xh(&row, 64, dst, stride); -} - -void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_64(above); - const __m128i sum_left = dc_sum_64(left); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 64; - sum /= 128; - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_64(above); - const __m128i sum_left = dc_sum_32(left); - sum_above = _mm_add_epi16(sum_above, sum_left); - - 
uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 48; - sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_64xh(&row, 32, dst, stride); -} - -void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_64(above); - const __m128i sum_left = dc_sum_16(left); - sum_above = _mm_add_epi16(sum_above, sum_left); - - uint32_t sum = _mm_cvtsi128_si32(sum_above); - sum += 40; - sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); - const __m128i row = _mm_set1_epi8((uint8_t)sum); - dc_store_64xh(&row, 16, dst, stride); -} - -// ----------------------------------------------------------------------------- -// DC_TOP - -void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_4(above); - const __m128i two = _mm_set1_epi16((int16_t)2); - sum_above = _mm_add_epi16(sum_above, two); - sum_above = _mm_srai_epi16(sum_above, 2); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - sum_above = _mm_packus_epi16(sum_above, sum_above); - - const uint32_t pred = _mm_cvtsi128_si32(sum_above); - dc_store_4xh(pred, 8, dst, stride); -} - -void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_4(above); - const __m128i two = _mm_set1_epi16((int16_t)2); - sum_above = _mm_add_epi16(sum_above, two); - sum_above = _mm_srai_epi16(sum_above, 2); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - sum_above = _mm_packus_epi16(sum_above, sum_above); - - const uint32_t pred = _mm_cvtsi128_si32(sum_above); - dc_store_4xh(pred, 16, dst, stride); -} - -void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_8(above); - const __m128i four = 
_mm_set1_epi16((uint16_t)4); - sum_above = _mm_add_epi16(sum_above, four); - sum_above = _mm_srai_epi16(sum_above, 3); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - const __m128i row = _mm_shufflelo_epi16(sum_above, 0); - dc_store_8xh(&row, 4, dst, stride); -} - -void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_8(above); - const __m128i four = _mm_set1_epi16((uint16_t)4); - sum_above = _mm_add_epi16(sum_above, four); - sum_above = _mm_srai_epi16(sum_above, 3); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - const __m128i row = _mm_shufflelo_epi16(sum_above, 0); - dc_store_8xh(&row, 16, dst, stride); -} - -void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_8(above); - const __m128i four = _mm_set1_epi16((uint16_t)4); - sum_above = _mm_add_epi16(sum_above, four); - sum_above = _mm_srai_epi16(sum_above, 3); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - const __m128i row = _mm_shufflelo_epi16(sum_above, 0); - dc_store_8xh(&row, 32, dst, stride); -} - -void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_16(above); - const __m128i eight = _mm_set1_epi16((uint16_t)8); - sum_above = _mm_add_epi16(sum_above, eight); - sum_above = _mm_srai_epi16(sum_above, 4); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_16xh(&row, 4, dst, stride); -} - -void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_16(above); - const __m128i eight = _mm_set1_epi16((uint16_t)8); - sum_above = _mm_add_epi16(sum_above, 
eight); - sum_above = _mm_srai_epi16(sum_above, 4); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_16xh(&row, 8, dst, stride); -} - -void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_16(above); - const __m128i eight = _mm_set1_epi16((uint16_t)8); - sum_above = _mm_add_epi16(sum_above, eight); - sum_above = _mm_srai_epi16(sum_above, 4); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_16xh(&row, 32, dst, stride); -} - -void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_16(above); - const __m128i eight = _mm_set1_epi16((uint16_t)8); - sum_above = _mm_add_epi16(sum_above, eight); - sum_above = _mm_srai_epi16(sum_above, 4); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_16xh(&row, 64, dst, stride); -} - -void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_32(above); - const __m128i sixteen = _mm_set1_epi16((uint16_t)16); - sum_above = _mm_add_epi16(sum_above, sixteen); - sum_above = _mm_srai_epi16(sum_above, 5); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_32xh(&row, 8, dst, stride); -} - -void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - __m128i 
sum_above = dc_sum_32(above); - const __m128i sixteen = _mm_set1_epi16((uint16_t)16); - sum_above = _mm_add_epi16(sum_above, sixteen); - sum_above = _mm_srai_epi16(sum_above, 5); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_32(above); - const __m128i sixteen = _mm_set1_epi16((uint16_t)16); - sum_above = _mm_add_epi16(sum_above, sixteen); - sum_above = _mm_srai_epi16(sum_above, 5); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_32xh(&row, 64, dst, stride); -} - -void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_64(above); - const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); - sum_above = _mm_add_epi16(sum_above, thirtytwo); - sum_above = _mm_srai_epi16(sum_above, 6); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_64(above); - const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); - sum_above = _mm_add_epi16(sum_above, thirtytwo); - sum_above = _mm_srai_epi16(sum_above, 6); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_64xh(&row, 32, dst, stride); 
-} - -void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - __m128i sum_above = dc_sum_64(above); - const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); - sum_above = _mm_add_epi16(sum_above, thirtytwo); - sum_above = _mm_srai_epi16(sum_above, 6); - sum_above = _mm_unpacklo_epi8(sum_above, sum_above); - sum_above = _mm_shufflelo_epi16(sum_above, 0); - const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); - dc_store_64xh(&row, 16, dst, stride); -} - -// ----------------------------------------------------------------------------- -// DC_LEFT - -void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_8(left); - const __m128i four = _mm_set1_epi16((uint16_t)4); - sum_left = _mm_add_epi16(sum_left, four); - sum_left = _mm_srai_epi16(sum_left, 3); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - sum_left = _mm_packus_epi16(sum_left, sum_left); - - const uint32_t pred = _mm_cvtsi128_si32(sum_left); - dc_store_4xh(pred, 8, dst, stride); -} - -void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_16(left); - const __m128i eight = _mm_set1_epi16((uint16_t)8); - sum_left = _mm_add_epi16(sum_left, eight); - sum_left = _mm_srai_epi16(sum_left, 4); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - sum_left = _mm_packus_epi16(sum_left, sum_left); - - const uint32_t pred = _mm_cvtsi128_si32(sum_left); - dc_store_4xh(pred, 16, dst, stride); -} - -void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_4(left); - const __m128i two = _mm_set1_epi16((uint16_t)2); - sum_left = _mm_add_epi16(sum_left, two); - sum_left = _mm_srai_epi16(sum_left, 2); - sum_left = _mm_unpacklo_epi8(sum_left, 
sum_left); - const __m128i row = _mm_shufflelo_epi16(sum_left, 0); - dc_store_8xh(&row, 4, dst, stride); -} - -void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_16(left); - const __m128i eight = _mm_set1_epi16((uint16_t)8); - sum_left = _mm_add_epi16(sum_left, eight); - sum_left = _mm_srai_epi16(sum_left, 4); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - const __m128i row = _mm_shufflelo_epi16(sum_left, 0); - dc_store_8xh(&row, 16, dst, stride); -} - -void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_32(left); - const __m128i sixteen = _mm_set1_epi16((uint16_t)16); - sum_left = _mm_add_epi16(sum_left, sixteen); - sum_left = _mm_srai_epi16(sum_left, 5); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - const __m128i row = _mm_shufflelo_epi16(sum_left, 0); - dc_store_8xh(&row, 32, dst, stride); -} - -void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_4(left); - const __m128i two = _mm_set1_epi16((uint16_t)2); - sum_left = _mm_add_epi16(sum_left, two); - sum_left = _mm_srai_epi16(sum_left, 2); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_16xh(&row, 4, dst, stride); -} - -void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_8(left); - const __m128i four = _mm_set1_epi16((uint16_t)4); - sum_left = _mm_add_epi16(sum_left, four); - sum_left = _mm_srai_epi16(sum_left, 3); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = 
_mm_unpacklo_epi64(sum_left, sum_left); - dc_store_16xh(&row, 8, dst, stride); -} - -void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_32(left); - const __m128i sixteen = _mm_set1_epi16((uint16_t)16); - sum_left = _mm_add_epi16(sum_left, sixteen); - sum_left = _mm_srai_epi16(sum_left, 5); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_16xh(&row, 32, dst, stride); -} - -void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_64(left); - const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); - sum_left = _mm_add_epi16(sum_left, thirtytwo); - sum_left = _mm_srai_epi16(sum_left, 6); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_16xh(&row, 64, dst, stride); -} - -void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_8(left); - const __m128i four = _mm_set1_epi16((uint16_t)4); - sum_left = _mm_add_epi16(sum_left, four); - sum_left = _mm_srai_epi16(sum_left, 3); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_32xh(&row, 8, dst, stride); -} - -void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_16(left); - const __m128i eight = _mm_set1_epi16((uint16_t)8); - sum_left = _mm_add_epi16(sum_left, eight); - sum_left = _mm_srai_epi16(sum_left, 4); - sum_left = 
_mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_64(left); - const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); - sum_left = _mm_add_epi16(sum_left, thirtytwo); - sum_left = _mm_srai_epi16(sum_left, 6); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_32xh(&row, 64, dst, stride); -} - -void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_64(left); - const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); - sum_left = _mm_add_epi16(sum_left, thirtytwo); - sum_left = _mm_srai_epi16(sum_left, 6); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_32(left); - const __m128i sixteen = _mm_set1_epi16((uint16_t)16); - sum_left = _mm_add_epi16(sum_left, sixteen); - sum_left = _mm_srai_epi16(sum_left, 5); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_64xh(&row, 32, dst, stride); -} - -void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - __m128i sum_left = dc_sum_16(left); - const __m128i eight = _mm_set1_epi16((uint16_t)8); 
- sum_left = _mm_add_epi16(sum_left, eight); - sum_left = _mm_srai_epi16(sum_left, 4); - sum_left = _mm_unpacklo_epi8(sum_left, sum_left); - sum_left = _mm_shufflelo_epi16(sum_left, 0); - const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); - dc_store_64xh(&row, 16, dst, stride); -} - -// ----------------------------------------------------------------------------- -// DC_128 - -void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const uint32_t pred = 0x80808080; - dc_store_4xh(pred, 8, dst, stride); -} - -void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const uint32_t pred = 0x80808080; - dc_store_4xh(pred, 16, dst, stride); -} - -void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_8xh(&row, 4, dst, stride); -} - -void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_8xh(&row, 16, dst, stride); -} - -void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_8xh(&row, 32, dst, stride); -} - -void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_16xh(&row, 4, dst, stride); -} - -void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - 
dc_store_16xh(&row, 8, dst, stride); -} - -void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_16xh(&row, 32, dst, stride); -} - -void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_16xh(&row, 64, dst, stride); -} - -void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_32xh(&row, 8, dst, stride); -} - -void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_32xh(&row, 16, dst, stride); -} - -void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_32xh(&row, 64, dst, stride); -} - -void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_64xh(&row, 64, dst, stride); -} - -void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_64xh(&row, 32, dst, stride); -} - -void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - const __m128i row = _mm_set1_epi8((uint8_t)128); - dc_store_64xh(&row, 16, dst, stride); -} - -// 
----------------------------------------------------------------------------- -// V_PRED - -void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint32_t pred = *(uint32_t *)above; - (void)left; - dc_store_4xh(pred, 8, dst, stride); -} - -void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint32_t pred = *(uint32_t *)above; - (void)left; - dc_store_4xh(pred, 16, dst, stride); -} - -void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i row = _mm_loadl_epi64((__m128i const *)above); - (void)left; - dc_store_8xh(&row, 4, dst, stride); -} - -void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i row = _mm_loadl_epi64((__m128i const *)above); - (void)left; - dc_store_8xh(&row, 16, dst, stride); -} - -void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i row = _mm_loadl_epi64((__m128i const *)above); - (void)left; - dc_store_8xh(&row, 32, dst, stride); -} - -void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i row = _mm_load_si128((__m128i const *)above); - (void)left; - dc_store_16xh(&row, 4, dst, stride); -} - -void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i row = _mm_load_si128((__m128i const *)above); - (void)left; - dc_store_16xh(&row, 8, dst, stride); -} - -void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i row = _mm_load_si128((__m128i const *)above); - (void)left; - dc_store_16xh(&row, 32, dst, stride); -} - -void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const 
uint8_t *left) { - const __m128i row = _mm_load_si128((__m128i const *)above); - (void)left; - dc_store_16xh(&row, 64, dst, stride); -} - -static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, int height) { - const __m128i row0 = _mm_load_si128((__m128i const *)above); - const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); - for (int i = 0; i < height; ++i) { - _mm_store_si128((__m128i *)dst, row0); - _mm_store_si128((__m128i *)(dst + 16), row1); - dst += stride; - } -} - -void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - v_predictor_32xh(dst, stride, above, 8); -} - -void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - v_predictor_32xh(dst, stride, above, 16); -} - -void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - v_predictor_32xh(dst, stride, above, 64); -} - -static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, int height) { - const __m128i row0 = _mm_load_si128((__m128i const *)above); - const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); - const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); - const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); - for (int i = 0; i < height; ++i) { - _mm_store_si128((__m128i *)dst, row0); - _mm_store_si128((__m128i *)(dst + 16), row1); - _mm_store_si128((__m128i *)(dst + 32), row2); - _mm_store_si128((__m128i *)(dst + 48), row3); - dst += stride; - } -} - -void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - v_predictor_64xh(dst, stride, above, 64); -} - -void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - 
v_predictor_64xh(dst, stride, above, 32); -} - -void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - v_predictor_64xh(dst, stride, above, 16); -} - -// ----------------------------------------------------------------------------- -// H_PRED - -void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - __m128i left_col = _mm_loadl_epi64((__m128i const *)left); - left_col = _mm_unpacklo_epi8(left_col, left_col); - __m128i row0 = _mm_shufflelo_epi16(left_col, 0); - __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); - __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); - __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); - *(uint32_t *)dst = _mm_cvtsi128_si32(row0); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row1); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row2); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row3); - dst += stride; - left_col = _mm_unpackhi_epi64(left_col, left_col); - row0 = _mm_shufflelo_epi16(left_col, 0); - row1 = _mm_shufflelo_epi16(left_col, 0x55); - row2 = _mm_shufflelo_epi16(left_col, 0xaa); - row3 = _mm_shufflelo_epi16(left_col, 0xff); - *(uint32_t *)dst = _mm_cvtsi128_si32(row0); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row1); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row2); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row3); -} - -void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - const __m128i left_col = _mm_load_si128((__m128i const *)left); - __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); - __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); - - __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); - __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); - __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); - __m128i 
row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - *(uint32_t *)dst = _mm_cvtsi128_si32(row0); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row1); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row2); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row3); - dst += stride; - - left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); - row0 = _mm_shufflelo_epi16(left_col_low, 0); - row1 = _mm_shufflelo_epi16(left_col_low, 0x55); - row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); - row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - *(uint32_t *)dst = _mm_cvtsi128_si32(row0); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row1); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row2); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row3); - dst += stride; - - row0 = _mm_shufflelo_epi16(left_col_high, 0); - row1 = _mm_shufflelo_epi16(left_col_high, 0x55); - row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); - row3 = _mm_shufflelo_epi16(left_col_high, 0xff); - *(uint32_t *)dst = _mm_cvtsi128_si32(row0); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row1); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row2); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row3); - dst += stride; - - left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); - row0 = _mm_shufflelo_epi16(left_col_high, 0); - row1 = _mm_shufflelo_epi16(left_col_high, 0x55); - row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); - row3 = _mm_shufflelo_epi16(left_col_high, 0xff); - *(uint32_t *)dst = _mm_cvtsi128_si32(row0); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row1); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row2); - dst += stride; - *(uint32_t *)dst = _mm_cvtsi128_si32(row3); -} - -void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - __m128i left_col = _mm_loadl_epi64((__m128i const *)left); - left_col = 
_mm_unpacklo_epi8(left_col, left_col); - __m128i row0 = _mm_shufflelo_epi16(left_col, 0); - __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); - __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); - __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int count) { - (void)above; - for (int i = 0; i < count; ++i) { - const __m128i left_col = _mm_load_si128((__m128i const *)left); - __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); - __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); - - __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); - __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); - __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); - __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); - dst += stride; - - left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); - row0 = _mm_shufflelo_epi16(left_col_low, 0); - row1 = _mm_shufflelo_epi16(left_col_low, 0x55); - row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); - row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); - dst += stride; - - row0 = _mm_shufflelo_epi16(left_col_high, 0); - row1 = _mm_shufflelo_epi16(left_col_high, 0x55); - row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); - row3 = 
_mm_shufflelo_epi16(left_col_high, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); - dst += stride; - - left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); - row0 = _mm_shufflelo_epi16(left_col_high, 0); - row1 = _mm_shufflelo_epi16(left_col_high, 0x55); - row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); - row3 = _mm_shufflelo_epi16(left_col_high, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); - dst += stride; - left += 16; - } -} - -void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - h_predictor_8x16xc(dst, stride, above, left, 1); -} - -void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - h_predictor_8x16xc(dst, stride, above, left, 2); -} - -static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, - ptrdiff_t stride) { - int i; - for (i = 0; i < h; ++i) { - _mm_store_si128((__m128i *)dst, row[i]); - dst += stride; - } -} - -static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { - const __m128i u0 = _mm_shufflelo_epi16(*x, 0); - const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); - const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); - const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); - - row[0] = _mm_unpacklo_epi64(u0, u0); - row[1] = _mm_unpacklo_epi64(u1, u1); - row[2] = _mm_unpacklo_epi64(u2, u2); - row[3] = _mm_unpacklo_epi64(u3, u3); -} - -static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { - const __m128i u0 = _mm_shufflehi_epi16(*x, 0); - const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); - const __m128i u2 = 
_mm_shufflehi_epi16(*x, 0xaa); - const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); - - row[0] = _mm_unpackhi_epi64(u0, u0); - row[1] = _mm_unpackhi_epi64(u1, u1); - row[2] = _mm_unpackhi_epi64(u2, u2); - row[3] = _mm_unpackhi_epi64(u3, u3); -} - -// Process 16x8, first 4 rows -// Use first 8 bytes of left register: xxxxxxxx33221100 -static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, - ptrdiff_t stride) { - __m128i row[4]; - repeat_low_4pixels(left, row); - h_pred_store_16xh(row, 4, dst, stride); -} - -// Process 16x8, second 4 rows -// Use second 8 bytes of left register: 77665544xxxxxxxx -static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, - ptrdiff_t stride) { - __m128i row[4]; - repeat_high_4pixels(left, row); - h_pred_store_16xh(row, 4, dst, stride); -} - -void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); - const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); -} - -void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); - const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); - dst += stride << 2; - h_prediction_16x8_2(&left_col_8p, dst, stride); -} - -static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *left, int count) { - int i = 0; - do { - const __m128i left_col = _mm_load_si128((const __m128i *)left); - const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p_lo, dst, stride); - dst += stride << 2; - h_prediction_16x8_2(&left_col_8p_lo, dst, stride); - dst += stride << 2; - - const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, 
left_col); - h_prediction_16x8_1(&left_col_8p_hi, dst, stride); - dst += stride << 2; - h_prediction_16x8_2(&left_col_8p_hi, dst, stride); - dst += stride << 2; - - left += 16; - i++; - } while (i < count); -} - -void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - h_predictor_16xh(dst, stride, left, 2); -} - -void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - h_predictor_16xh(dst, stride, left, 4); -} - -static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, - ptrdiff_t stride) { - int i; - for (i = 0; i < h; ++i) { - _mm_store_si128((__m128i *)dst, row[i]); - _mm_store_si128((__m128i *)(dst + 16), row[i]); - dst += stride; - } -} - -// Process 32x8, first 4 rows -// Use first 8 bytes of left register: xxxxxxxx33221100 -static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, - ptrdiff_t stride) { - __m128i row[4]; - repeat_low_4pixels(left, row); - h_pred_store_32xh(row, 4, dst, stride); -} - -// Process 32x8, second 4 rows -// Use second 8 bytes of left register: 77665544xxxxxxxx -static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, - ptrdiff_t stride) { - __m128i row[4]; - repeat_high_4pixels(left, row); - h_pred_store_32xh(row, 4, dst, stride); -} - -void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i left_col, left_col_8p; - (void)above; - - left_col = _mm_load_si128((const __m128i *)left); - - left_col_8p = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_32x8_1(&left_col_8p, dst, stride); - dst += stride << 2; - h_prediction_32x8_2(&left_col_8p, dst, stride); -} - -void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i left_col, left_col_8p; - (void)above; - - left_col = _mm_load_si128((const __m128i *)left); - 
- left_col_8p = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_32x8_1(&left_col_8p, dst, stride); - dst += stride << 2; - h_prediction_32x8_2(&left_col_8p, dst, stride); - dst += stride << 2; - - left_col_8p = _mm_unpackhi_epi8(left_col, left_col); - h_prediction_32x8_1(&left_col_8p, dst, stride); - dst += stride << 2; - h_prediction_32x8_2(&left_col_8p, dst, stride); -} - -static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *left, int height) { - int i = height >> 2; - do { - __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); - left4 = _mm_unpacklo_epi8(left4, left4); - left4 = _mm_unpacklo_epi8(left4, left4); - const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); - const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r0); - _mm_store_si128((__m128i *)(dst + stride), r1); - _mm_store_si128((__m128i *)(dst + stride + 16), r1); - const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); - const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); - _mm_store_si128((__m128i *)(dst + stride * 2), r2); - _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); - _mm_store_si128((__m128i *)(dst + stride * 3), r3); - _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); - left += 4; - dst += stride * 4; - } while (--i); -} - -void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - h_predictor_32xh(dst, stride, left, 64); -} - -static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *left, int height) { - int i = height >> 2; - do { - __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); - left4 = _mm_unpacklo_epi8(left4, left4); - left4 = _mm_unpacklo_epi8(left4, left4); - const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); - const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r0); - 
_mm_store_si128((__m128i *)(dst + 32), r0); - _mm_store_si128((__m128i *)(dst + 48), r0); - _mm_store_si128((__m128i *)(dst + stride), r1); - _mm_store_si128((__m128i *)(dst + stride + 16), r1); - _mm_store_si128((__m128i *)(dst + stride + 32), r1); - _mm_store_si128((__m128i *)(dst + stride + 48), r1); - const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); - const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); - _mm_store_si128((__m128i *)(dst + stride * 2), r2); - _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); - _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); - _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); - _mm_store_si128((__m128i *)(dst + stride * 3), r3); - _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); - _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); - _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); - left += 4; - dst += stride * 4; - } while (--i); -} - -void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - h_predictor_64xh(dst, stride, left, 64); -} - -void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - h_predictor_64xh(dst, stride, left, 32); -} - -void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - h_predictor_64xh(dst, stride, left, 16); -} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm deleted file mode 100644 index 9aece27be..000000000 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm +++ /dev/null @@ -1,625 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pb_1: times 16 db 1 -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -dc_128: times 16 db 128 -pw2_4: times 8 dw 2 -pw2_8: times 8 dw 4 -pw2_16: times 8 dw 8 -pw2_32: times 8 dw 16 - -SECTION .text - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM sse2 -cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - movd m2, [leftq] - movd m0, [aboveq] - pxor m1, m1 - punpckldq m0, m2 - psadbw m0, m1 - paddw m0, [GLOBAL(pw_4)] - psraw m0, 3 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [leftq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [aboveq] 
- psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - movq m2, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - paddw m0, [GLOBAL(pw_8)] - psraw m0, 4 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET 
- -INIT_XMM sse2 -cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movd m0, [GLOBAL(dc_128)] - movd [dstq ], m0 - movd [dstq+strideq ], m0 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq m0, [GLOBAL(dc_128)] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_16)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - - -INIT_XMM sse2 -cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM 
sse2 -cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - - -INIT_XMM sse2 -cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - psadbw m3, m1 - psadbw m4, m1 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_32)] - psraw m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] 
- DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - mova m2, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above - movd m0, [aboveq] - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, 
[dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m1 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left - movifnidn leftq, leftmp - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 - pshufd m1, m0, 0x1 - movd [dstq ], m0 - movd [dstq+strideq], m1 - pshufd m2, m0, 0x2 - lea dstq, [dstq+strideq*2] - pshufd m3, m0, 0x3 - movd [dstq ], m2 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -2 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] - movq m0, [leftq ] - punpcklbw m0, m0 ; l1 l1 l2 l2 ... 
l8 l8 -.loop: - pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 - pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 - movq [dstq ], m1 - movq [dstq+strideq], m2 - pshuflw m1, m0, 0xaa - pshuflw m2, m0, 0xff - movq [dstq+strideq*2], m1 - movq [dstq+stride3q ], m2 - pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 - inc lineq - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -4 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+strideq ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2], m1 - mova [dstq+stride3q ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -8 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+16 ], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16 ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2 ], m1 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c deleted file mode 100644 index 807ed1770..000000000 --- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c +++ /dev/null @@ -1,1692 +0,0 @@ -/* - * Copyright (c) 2017, 
Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/intrapred_common.h" - -// ----------------------------------------------------------------------------- -// PAETH_PRED - -// Return 8 16-bit pixels in one row -static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, - const __m128i *topleft) { - const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); - - __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); - __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); - __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); - - __m128i mask1 = _mm_cmpgt_epi16(pl, pt); - mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); - __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); - - pl = _mm_andnot_si128(mask1, *left); - - ptl = _mm_and_si128(mask2, *topleft); - pt = _mm_andnot_si128(mask2, *top); - pt = _mm_or_si128(pt, ptl); - pt = _mm_and_si128(mask1, pt); - - return _mm_or_si128(pl, pt); -} - -void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_loadl_epi64((const __m128i *)left); - const __m128i t = _mm_loadl_epi64((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i t16 = _mm_unpacklo_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - int i; - for (i = 0; i < 4; ++i) { - const 
__m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); - - *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_loadl_epi64((const __m128i *)left); - const __m128i t = _mm_loadl_epi64((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i t16 = _mm_unpacklo_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - int i; - for (i = 0; i < 8; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); - - *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_load_si128((const __m128i *)left); - const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); - const __m128i zero = _mm_setzero_si128(); - const __m128i t16 = _mm_unpacklo_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - for (int i = 0; i < 16; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); - - *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_loadl_epi64((const __m128i *)left); - const __m128i t = _mm_loadl_epi64((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i t16 
= _mm_unpacklo_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - int i; - for (i = 0; i < 4; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); - - _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_loadl_epi64((const __m128i *)left); - const __m128i t = _mm_loadl_epi64((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i t16 = _mm_unpacklo_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - int i; - for (i = 0; i < 8; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); - - _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_load_si128((const __m128i *)left); - const __m128i t = _mm_loadl_epi64((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i t16 = _mm_unpacklo_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - int i; - for (i = 0; i < 16; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); - - _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t 
*above, const uint8_t *left) { - const __m128i t = _mm_loadl_epi64((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i t16 = _mm_unpacklo_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - const __m128i one = _mm_set1_epi16(1); - - for (int j = 0; j < 2; ++j) { - const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); - for (int i = 0; i < 16; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); - - _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); - dst += stride; - rep = _mm_add_epi16(rep, one); - } - } -} - -// Return 16 8-bit pixels in one row -static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, - const __m128i *top1, - const __m128i *topleft) { - const __m128i p0 = paeth_8x1_pred(left, top0, topleft); - const __m128i p1 = paeth_8x1_pred(left, top1, topleft); - return _mm_packus_epi16(p0, p1); -} - -void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); - const __m128i t = _mm_load_si128((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i top0 = _mm_unpacklo_epi8(t, zero); - const __m128i top1 = _mm_unpackhi_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - for (int i = 0; i < 4; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i l = _mm_loadl_epi64((const __m128i *)left); - const __m128i t = 
_mm_load_si128((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i top0 = _mm_unpacklo_epi8(t, zero); - const __m128i top1 = _mm_unpackhi_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - int i; - for (i = 0; i < 8; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i l = _mm_load_si128((const __m128i *)left); - const __m128i t = _mm_load_si128((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i top0 = _mm_unpacklo_epi8(t, zero); - const __m128i top1 = _mm_unpackhi_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - - int i; - for (i = 0; i < 16; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i l = _mm_load_si128((const __m128i *)left); - const __m128i t = _mm_load_si128((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i top0 = _mm_unpacklo_epi8(t, zero); - const __m128i top1 = _mm_unpackhi_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - __m128i l16; - - int i; - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = 
paeth_16x1_pred(&l16, &top0, &top1, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm_add_epi16(rep, one); - } - - l = _mm_load_si128((const __m128i *)(left + 16)); - rep = _mm_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); - - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - const __m128i t = _mm_load_si128((const __m128i *)above); - const __m128i zero = _mm_setzero_si128(); - const __m128i top0 = _mm_unpacklo_epi8(t, zero); - const __m128i top1 = _mm_unpackhi_epi8(t, zero); - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - const __m128i one = _mm_set1_epi16(1); - - for (int j = 0; j < 4; ++j) { - const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); - for (int i = 0; i < 16; ++i) { - const __m128i l16 = _mm_shuffle_epi8(l, rep); - const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); - _mm_store_si128((__m128i *)dst, row); - dst += stride; - rep = _mm_add_epi16(rep, one); - } - } -} - -void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const __m128i a = _mm_load_si128((const __m128i *)above); - const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i zero = _mm_setzero_si128(); - const __m128i al = _mm_unpacklo_epi8(a, zero); - const __m128i ah = _mm_unpackhi_epi8(a, zero); - const __m128i bl = _mm_unpacklo_epi8(b, zero); - const __m128i bh = _mm_unpackhi_epi8(b, zero); - - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - const __m128i l = _mm_loadl_epi64((const __m128i *)left); - __m128i l16; - - for (int i = 0; i 
< 8; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - - _mm_store_si128((__m128i *)dst, r32l); - _mm_store_si128((__m128i *)(dst + 16), r32h); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - const __m128i a = _mm_load_si128((const __m128i *)above); - const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i zero = _mm_setzero_si128(); - const __m128i al = _mm_unpacklo_epi8(a, zero); - const __m128i ah = _mm_unpackhi_epi8(a, zero); - const __m128i bl = _mm_unpacklo_epi8(b, zero); - const __m128i bh = _mm_unpackhi_epi8(b, zero); - - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - __m128i l = _mm_load_si128((const __m128i *)left); - __m128i l16; - - int i; - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - - _mm_store_si128((__m128i *)dst, r32l); - _mm_store_si128((__m128i *)(dst + 16), r32h); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - const __m128i a = _mm_load_si128((const __m128i *)above); - const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i zero = _mm_setzero_si128(); - const __m128i al = _mm_unpacklo_epi8(a, zero); - const __m128i ah = _mm_unpackhi_epi8(a, zero); - const __m128i bl = _mm_unpacklo_epi8(b, zero); - const __m128i bh = _mm_unpackhi_epi8(b, zero); - - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); - const __m128i one = _mm_set1_epi16(1); - 
__m128i l = _mm_load_si128((const __m128i *)left); - __m128i l16; - - int i; - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - - _mm_store_si128((__m128i *)dst, r32l); - _mm_store_si128((__m128i *)(dst + 16), r32h); - dst += stride; - rep = _mm_add_epi16(rep, one); - } - - rep = _mm_set1_epi16(0x8000); - l = _mm_load_si128((const __m128i *)(left + 16)); - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - - _mm_store_si128((__m128i *)dst, r32l); - _mm_store_si128((__m128i *)(dst + 16), r32h); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - const __m128i a = _mm_load_si128((const __m128i *)above); - const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i zero = _mm_setzero_si128(); - const __m128i al = _mm_unpacklo_epi8(a, zero); - const __m128i ah = _mm_unpackhi_epi8(a, zero); - const __m128i bl = _mm_unpacklo_epi8(b, zero); - const __m128i bh = _mm_unpackhi_epi8(b, zero); - - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - const __m128i one = _mm_set1_epi16(1); - __m128i l16; - - int i, j; - for (j = 0; j < 4; ++j) { - const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - - _mm_store_si128((__m128i *)dst, r32l); - _mm_store_si128((__m128i *)(dst + 16), r32h); - dst += stride; - rep = _mm_add_epi16(rep, one); - } - } -} - -void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t 
stride, - const uint8_t *above, - const uint8_t *left) { - const __m128i a = _mm_load_si128((const __m128i *)above); - const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); - const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); - const __m128i zero = _mm_setzero_si128(); - const __m128i al = _mm_unpacklo_epi8(a, zero); - const __m128i ah = _mm_unpackhi_epi8(a, zero); - const __m128i bl = _mm_unpacklo_epi8(b, zero); - const __m128i bh = _mm_unpackhi_epi8(b, zero); - const __m128i cl = _mm_unpacklo_epi8(c, zero); - const __m128i ch = _mm_unpackhi_epi8(c, zero); - const __m128i dl = _mm_unpacklo_epi8(d, zero); - const __m128i dh = _mm_unpackhi_epi8(d, zero); - - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - const __m128i one = _mm_set1_epi16(1); - __m128i l16; - - int i, j; - for (j = 0; j < 2; ++j) { - const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); - const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - _mm_store_si128((__m128i *)(dst + 32), r2); - _mm_store_si128((__m128i *)(dst + 48), r3); - dst += stride; - rep = _mm_add_epi16(rep, one); - } - } -} - -void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - const __m128i a = _mm_load_si128((const __m128i *)above); - const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); - const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); - const __m128i zero = _mm_setzero_si128(); - 
const __m128i al = _mm_unpacklo_epi8(a, zero); - const __m128i ah = _mm_unpackhi_epi8(a, zero); - const __m128i bl = _mm_unpacklo_epi8(b, zero); - const __m128i bh = _mm_unpackhi_epi8(b, zero); - const __m128i cl = _mm_unpacklo_epi8(c, zero); - const __m128i ch = _mm_unpackhi_epi8(c, zero); - const __m128i dl = _mm_unpacklo_epi8(d, zero); - const __m128i dh = _mm_unpackhi_epi8(d, zero); - - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - const __m128i one = _mm_set1_epi16(1); - __m128i l16; - - int i, j; - for (j = 0; j < 4; ++j) { - const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); - const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - _mm_store_si128((__m128i *)(dst + 32), r2); - _mm_store_si128((__m128i *)(dst + 48), r3); - dst += stride; - rep = _mm_add_epi16(rep, one); - } - } -} - -void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - const __m128i a = _mm_load_si128((const __m128i *)above); - const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); - const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); - const __m128i zero = _mm_setzero_si128(); - const __m128i al = _mm_unpacklo_epi8(a, zero); - const __m128i ah = _mm_unpackhi_epi8(a, zero); - const __m128i bl = _mm_unpacklo_epi8(b, zero); - const __m128i bh = _mm_unpackhi_epi8(b, zero); - const __m128i cl = _mm_unpacklo_epi8(c, zero); - const __m128i ch = _mm_unpackhi_epi8(c, zero); - const __m128i dl = _mm_unpacklo_epi8(d, zero); - const __m128i dh = 
_mm_unpackhi_epi8(d, zero); - - const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - const __m128i one = _mm_set1_epi16(1); - __m128i l16; - - int i; - const __m128i l = _mm_load_si128((const __m128i *)left); - __m128i rep = _mm_set1_epi16(0x8000); - for (i = 0; i < 16; ++i) { - l16 = _mm_shuffle_epi8(l, rep); - const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); - const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); - const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); - const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); - - _mm_store_si128((__m128i *)dst, r0); - _mm_store_si128((__m128i *)(dst + 16), r1); - _mm_store_si128((__m128i *)(dst + 32), r2); - _mm_store_si128((__m128i *)(dst + 48), r3); - dst += stride; - rep = _mm_add_epi16(rep, one); - } -} - -// ----------------------------------------------------------------------------- -// SMOOTH_PRED - -// pixels[0]: above and below_pred interleave vector -// pixels[1]: left vector -// pixels[2]: right_pred vector -static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); - if (height == 4) - pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); - else if (height == 8) - pixels[1] = _mm_loadl_epi64(((const __m128i *)left)); - else - pixels[1] = _mm_loadu_si128(((const __m128i *)left)); - - pixels[2] = _mm_set1_epi16((uint16_t)above[3]); - - const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); - const __m128i zero = _mm_setzero_si128(); - d = _mm_unpacklo_epi8(d, zero); - pixels[0] = _mm_unpacklo_epi16(d, bp); -} - -// weight_h[0]: weight_h vector -// weight_h[1]: scale - weight_h vector -// weight_h[2]: same as [0], second half for height = 16 only -// weight_h[3]: same as [1], second half for height = 16 only -// weight_w[0]: weights_w and scale - weights_w interleave vector -static INLINE void load_weight_w4(const uint8_t *weight_array, int height, - 
__m128i *weight_h, __m128i *weight_w) { - const __m128i zero = _mm_setzero_si128(); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); - weight_h[0] = _mm_unpacklo_epi8(t, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); - - if (height == 8) { - const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); - weight_h[0] = _mm_unpacklo_epi8(weight, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - } else if (height == 16) { - const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); - weight_h[0] = _mm_unpacklo_epi8(weight, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(weight, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); - } -} - -static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, - const __m128i *ww, int h, uint8_t *dst, - ptrdiff_t stride, int second_half) { - const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); - const __m128i one = _mm_set1_epi16(1); - const __m128i inc = _mm_set1_epi16(0x202); - const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i rep = second_half ? 
_mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); - __m128i d = _mm_set1_epi16(0x100); - - for (int i = 0; i < h; ++i) { - const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); - const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); - const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); - __m128i s = _mm_madd_epi16(pixel[0], wh_sc); - - __m128i b = _mm_shuffle_epi8(pixel[1], rep); - b = _mm_unpacklo_epi16(b, pixel[2]); - __m128i sum = _mm_madd_epi16(b, ww[0]); - - sum = _mm_add_epi32(s, sum); - sum = _mm_add_epi32(sum, round); - sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); - - sum = _mm_shuffle_epi8(sum, gat); - *(uint32_t *)dst = _mm_cvtsi128_si32(sum); - dst += stride; - - rep = _mm_add_epi16(rep, one); - d = _mm_add_epi16(d, inc); - } -} - -void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i pixels[3]; - load_pixel_w4(above, left, 4, pixels); - - __m128i wh[4], ww[2]; - load_weight_w4(sm_weight_arrays, 4, wh, ww); - - smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0); -} - -void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i pixels[3]; - load_pixel_w4(above, left, 8, pixels); - - __m128i wh[4], ww[2]; - load_weight_w4(sm_weight_arrays, 8, wh, ww); - - smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); -} - -void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[3]; - load_pixel_w4(above, left, 16, pixels); - - __m128i wh[4], ww[2]; - load_weight_w4(sm_weight_arrays, 16, wh, ww); - - smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); - dst += stride << 3; - smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); -} - -// pixels[0]: above and below_pred interleave vector, first half -// pixels[1]: above and below_pred interleave vector, second half -// pixels[2]: left vector -// pixels[3]: right_pred vector -// pixels[4]: above and 
below_pred interleave vector, first half -// pixels[5]: above and below_pred interleave vector, second half -// pixels[6]: left vector + 16 -// pixels[7]: right_pred vector -static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - const __m128i zero = _mm_setzero_si128(); - const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); - __m128i d = _mm_loadl_epi64((const __m128i *)above); - d = _mm_unpacklo_epi8(d, zero); - pixels[0] = _mm_unpacklo_epi16(d, bp); - pixels[1] = _mm_unpackhi_epi16(d, bp); - - pixels[3] = _mm_set1_epi16((uint16_t)above[7]); - - if (height == 4) { - pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); - } else if (height == 8) { - pixels[2] = _mm_loadl_epi64((const __m128i *)left); - } else if (height == 16) { - pixels[2] = _mm_load_si128((const __m128i *)left); - } else { - pixels[2] = _mm_load_si128((const __m128i *)left); - pixels[4] = pixels[0]; - pixels[5] = pixels[1]; - pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); - pixels[7] = pixels[3]; - } -} - -// weight_h[0]: weight_h vector -// weight_h[1]: scale - weight_h vector -// weight_h[2]: same as [0], offset 8 -// weight_h[3]: same as [1], offset 8 -// weight_h[4]: same as [0], offset 16 -// weight_h[5]: same as [1], offset 16 -// weight_h[6]: same as [0], offset 24 -// weight_h[7]: same as [1], offset 24 -// weight_w[0]: weights_w and scale - weights_w interleave vector, first half -// weight_w[1]: weights_w and scale - weights_w interleave vector, second half -static INLINE void load_weight_w8(const uint8_t *weight_array, int height, - __m128i *weight_h, __m128i *weight_w) { - const __m128i zero = _mm_setzero_si128(); - const int we_offset = height < 8 ? 
4 : 8; - __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); - weight_h[0] = _mm_unpacklo_epi8(we, zero); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - - if (height == 4) { - we = _mm_srli_si128(we, 4); - __m128i tmp1 = _mm_unpacklo_epi8(we, zero); - __m128i tmp2 = _mm_sub_epi16(d, tmp1); - weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); - weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); - } else { - weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); - weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); - } - - if (height == 16) { - we = _mm_loadu_si128((const __m128i *)&weight_array[16]); - weight_h[0] = _mm_unpacklo_epi8(we, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(we, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); - } else if (height == 32) { - const __m128i weight_lo = - _mm_loadu_si128((const __m128i *)&weight_array[32]); - weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); - const __m128i weight_hi = - _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); - weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); - weight_h[5] = _mm_sub_epi16(d, weight_h[4]); - weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); - weight_h[7] = _mm_sub_epi16(d, weight_h[6]); - } -} - -static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, - const __m128i *ww, int h, uint8_t *dst, - ptrdiff_t stride, int second_half) { - const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); - const __m128i one = _mm_set1_epi16(1); - const __m128i inc = _mm_set1_epi16(0x202); - const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - - __m128i rep = second_half ? 
_mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); - __m128i d = _mm_set1_epi16(0x100); - - int i; - for (i = 0; i < h; ++i) { - const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); - const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); - const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); - __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); - __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); - - __m128i b = _mm_shuffle_epi8(pixels[2], rep); - b = _mm_unpacklo_epi16(b, pixels[3]); - __m128i sum0 = _mm_madd_epi16(b, ww[0]); - __m128i sum1 = _mm_madd_epi16(b, ww[1]); - - s0 = _mm_add_epi32(s0, sum0); - s0 = _mm_add_epi32(s0, round); - s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); - - s1 = _mm_add_epi32(s1, sum1); - s1 = _mm_add_epi32(s1, round); - s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); - - sum0 = _mm_packus_epi16(s0, s1); - sum0 = _mm_shuffle_epi8(sum0, gat); - _mm_storel_epi64((__m128i *)dst, sum0); - dst += stride; - - rep = _mm_add_epi16(rep, one); - d = _mm_add_epi16(d, inc); - } -} - -void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i pixels[4]; - load_pixel_w8(above, left, 4, pixels); - - __m128i wh[4], ww[2]; - load_weight_w8(sm_weight_arrays, 4, wh, ww); - - smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); -} - -void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i pixels[4]; - load_pixel_w8(above, left, 8, pixels); - - __m128i wh[4], ww[2]; - load_weight_w8(sm_weight_arrays, 8, wh, ww); - - smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); -} - -void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[4]; - load_pixel_w8(above, left, 16, pixels); - - __m128i wh[4], ww[2]; - load_weight_w8(sm_weight_arrays, 16, wh, ww); - - smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); - dst += stride << 3; - smooth_pred_8xh(pixels, &wh[2], 
ww, 8, dst, stride, 1); -} - -void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[8]; - load_pixel_w8(above, left, 32, pixels); - - __m128i wh[8], ww[2]; - load_weight_w8(sm_weight_arrays, 32, wh, ww); - - smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); - dst += stride << 3; - smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); - dst += stride << 3; - smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); - dst += stride << 3; - smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); -} - -static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left, uint32_t bw, - uint32_t bh) { - const uint8_t *const sm_weights_w = sm_weight_arrays + bw; - const uint8_t *const sm_weights_h = sm_weight_arrays + bh; - const __m128i zero = _mm_setzero_si128(); - const __m128i scale_value = - _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]); - const __m128i dup16 = _mm_set1_epi32(0x01000100); - const __m128i top_right = - _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16); - const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale)); - - for (uint32_t y = 0; y < bh; ++y) { - const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); - const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); - const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); - __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left); - const __m128i wl_y = - _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); - pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round); - pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0); - - for (uint32_t x = 0; x < bw; x += 8) { - const __m128i top_x = _mm_loadl_epi64((const __m128i 
*)(above + x)); - const __m128i weights_x = - _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); - const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x); - const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero); - const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero); - - __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); - __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); - - const __m128i scale_m_weights_x = - _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero)); - const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right); - const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero); - const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero); - - pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl); - pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl); - - pred_lo = _mm_add_epi32(pred_lo, swxtr_lo); - pred_hi = _mm_add_epi32(pred_hi, swxtr_hi); - - pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale)); - pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale)); - - __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); - pred = _mm_shuffle_epi8(pred, gat); - _mm_storel_epi64((__m128i *)(dst + x), pred); - } - dst += stride; - } -} - -void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 16, 4); -} - -void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 16, 8); -} - -void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 16, 16); -} - -void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 16, 32); -} - -void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - 
const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 32, 8); -} - -void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 32, 16); -} - -void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 32, 32); -} - -void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 32, 64); -} - -void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 64, 64); -} - -void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 64, 32); -} - -void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 64, 16); -} - -void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_predictor_wxh(dst, stride, above, left, 16, 64); -} - -// ----------------------------------------------------------------------------- -// SMOOTH_V_PRED - -// pixels[0]: above and below_pred interleave vector -static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - const __m128i zero = _mm_setzero_si128(); - __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); - const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); - d = _mm_unpacklo_epi8(d, zero); - pixels[0] = _mm_unpacklo_epi16(d, bp); -} - -// weights[0]: weights_h vector -// weights[1]: scale - weights_h vector -static INLINE void 
load_weight_v_w4(const uint8_t *weight_array, int height, - __m128i *weights) { - const __m128i zero = _mm_setzero_si128(); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - - if (height == 4) { - const __m128i weight = - _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); - weights[0] = _mm_unpacklo_epi8(weight, zero); - weights[1] = _mm_sub_epi16(d, weights[0]); - } else if (height == 8) { - const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); - weights[0] = _mm_unpacklo_epi8(weight, zero); - weights[1] = _mm_sub_epi16(d, weights[0]); - } else { - const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); - weights[0] = _mm_unpacklo_epi8(weight, zero); - weights[1] = _mm_sub_epi16(d, weights[0]); - weights[2] = _mm_unpackhi_epi8(weight, zero); - weights[3] = _mm_sub_epi16(d, weights[2]); - } -} - -static INLINE void smooth_v_pred_4xh(const __m128i *pixel, - const __m128i *weight, int h, uint8_t *dst, - ptrdiff_t stride) { - const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); - const __m128i inc = _mm_set1_epi16(0x202); - const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i d = _mm_set1_epi16(0x100); - - for (int i = 0; i < h; ++i) { - const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); - const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); - const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); - __m128i sum = _mm_madd_epi16(pixel[0], wh_sc); - sum = _mm_add_epi32(sum, pred_round); - sum = _mm_srai_epi32(sum, sm_weight_log2_scale); - sum = _mm_shuffle_epi8(sum, gat); - *(uint32_t *)dst = _mm_cvtsi128_si32(sum); - dst += stride; - d = _mm_add_epi16(d, inc); - } -} - -void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels; - load_pixel_v_w4(above, left, 4, &pixels); - - __m128i weights[2]; - load_weight_v_w4(sm_weight_arrays, 4, weights); - - smooth_v_pred_4xh(&pixels, weights, 4, 
dst, stride); -} - -void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels; - load_pixel_v_w4(above, left, 8, &pixels); - - __m128i weights[2]; - load_weight_v_w4(sm_weight_arrays, 8, weights); - - smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); -} - -void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels; - load_pixel_v_w4(above, left, 16, &pixels); - - __m128i weights[4]; - load_weight_v_w4(sm_weight_arrays, 16, weights); - - smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); - dst += stride << 3; - smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride); -} - -// pixels[0]: above and below_pred interleave vector, first half -// pixels[1]: above and below_pred interleave vector, second half -static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - const __m128i zero = _mm_setzero_si128(); - __m128i d = _mm_loadl_epi64((const __m128i *)above); - const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); - d = _mm_unpacklo_epi8(d, zero); - pixels[0] = _mm_unpacklo_epi16(d, bp); - pixels[1] = _mm_unpackhi_epi16(d, bp); -} - -// weight_h[0]: weight_h vector -// weight_h[1]: scale - weight_h vector -// weight_h[2]: same as [0], offset 8 -// weight_h[3]: same as [1], offset 8 -// weight_h[4]: same as [0], offset 16 -// weight_h[5]: same as [1], offset 16 -// weight_h[6]: same as [0], offset 24 -// weight_h[7]: same as [1], offset 24 -static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height, - __m128i *weight_h) { - const __m128i zero = _mm_setzero_si128(); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - - if (height < 16) { - const int offset = height < 8 ? 
4 : 8; - const __m128i weight = - _mm_loadu_si128((const __m128i *)&weight_array[offset]); - weight_h[0] = _mm_unpacklo_epi8(weight, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - } else if (height == 16) { - const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); - weight_h[0] = _mm_unpacklo_epi8(weight, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(weight, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); - } else { - const __m128i weight_lo = - _mm_loadu_si128((const __m128i *)&weight_array[32]); - weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); - const __m128i weight_hi = - _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); - weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); - weight_h[5] = _mm_sub_epi16(d, weight_h[4]); - weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); - weight_h[7] = _mm_sub_epi16(d, weight_h[6]); - } -} - -static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh, - int h, uint8_t *dst, ptrdiff_t stride) { - const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); - const __m128i inc = _mm_set1_epi16(0x202); - const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - __m128i d = _mm_set1_epi16(0x100); - - for (int i = 0; i < h; ++i) { - const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); - const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); - const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); - __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); - __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); - - s0 = _mm_add_epi32(s0, pred_round); - s0 = _mm_srai_epi32(s0, sm_weight_log2_scale); - - s1 = _mm_add_epi32(s1, pred_round); - s1 = _mm_srai_epi32(s1, sm_weight_log2_scale); - - __m128i sum01 = _mm_packus_epi16(s0, s1); - sum01 = _mm_shuffle_epi8(sum01, gat); - 
_mm_storel_epi64((__m128i *)dst, sum01); - dst += stride; - - d = _mm_add_epi16(d, inc); - } -} - -void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_v_w8(above, left, 4, pixels); - - __m128i wh[2]; - load_weight_v_w8(sm_weight_arrays, 4, wh); - - smooth_v_pred_8xh(pixels, wh, 4, dst, stride); -} - -void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_v_w8(above, left, 8, pixels); - - __m128i wh[2]; - load_weight_v_w8(sm_weight_arrays, 8, wh); - - smooth_v_pred_8xh(pixels, wh, 8, dst, stride); -} - -void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_v_w8(above, left, 16, pixels); - - __m128i wh[4]; - load_weight_v_w8(sm_weight_arrays, 16, wh); - - smooth_v_pred_8xh(pixels, wh, 8, dst, stride); - dst += stride << 3; - smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); -} - -void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_v_w8(above, left, 32, pixels); - - __m128i wh[8]; - load_weight_v_w8(sm_weight_arrays, 32, wh); - - smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride); - dst += stride << 3; - smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); - dst += stride << 3; - smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride); - dst += stride << 3; - smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride); -} - -static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left, uint32_t bw, - uint32_t bh) { - const uint8_t *const sm_weights_h = sm_weight_arrays + bh; - const __m128i zero = _mm_setzero_si128(); - const __m128i scale_value = - _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - const __m128i dup16 = 
_mm_set1_epi32(0x01000100); - const __m128i bottom_left = - _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16); - const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - const __m128i round = - _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1))); - - for (uint32_t y = 0; y < bh; ++y) { - const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); - const __m128i scale_m_weights_y = - _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16); - const __m128i wl_y = - _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0); - - for (uint32_t x = 0; x < bw; x += 8) { - const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); - // 8 -> 16 - const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero); - const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y); - const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y); - // top_x * weights_y + scale_m_weights_y * bottom_left - __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); - __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); - - pred_lo = _mm_add_epi32(pred_lo, round); - pred_hi = _mm_add_epi32(pred_hi, round); - pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); - pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); - - __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); - pred = _mm_shuffle_epi8(pred, gat); - _mm_storel_epi64((__m128i *)(dst + x), pred); - } - dst += stride; - } -} - -void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 16, 4); -} - -void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 16, 8); -} - -void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 16, 16); 
-} - -void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 16, 32); -} - -void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 32, 8); -} - -void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 32, 16); -} - -void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 32, 32); -} - -void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 32, 64); -} - -void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 64, 64); -} - -void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 64, 32); -} - -void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 64, 16); -} - -void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_v_predictor_wxh(dst, stride, above, left, 16, 64); -} - -// ----------------------------------------------------------------------------- -// SMOOTH_H_PRED - -// pixels[0]: left vector -// pixels[1]: right_pred vector -static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - if (height == 4) - pixels[0] = 
_mm_cvtsi32_si128(((const uint32_t *)left)[0]); - else if (height == 8) - pixels[0] = _mm_loadl_epi64(((const __m128i *)left)); - else - pixels[0] = _mm_loadu_si128(((const __m128i *)left)); - pixels[1] = _mm_set1_epi16((uint16_t)above[3]); -} - -// weights[0]: weights_w and scale - weights_w interleave vector -static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height, - __m128i *weights) { - (void)height; - const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); - const __m128i zero = _mm_setzero_si128(); - - const __m128i weights_0 = _mm_unpacklo_epi8(t, zero); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - const __m128i weights_1 = _mm_sub_epi16(d, weights_0); - weights[0] = _mm_unpacklo_epi16(weights_0, weights_1); -} - -static INLINE void smooth_h_pred_4xh(const __m128i *pixel, - const __m128i *weight, int h, uint8_t *dst, - ptrdiff_t stride) { - const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); - const __m128i one = _mm_set1_epi16(1); - const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i rep = _mm_set1_epi16(0x8000); - - for (int i = 0; i < h; ++i) { - __m128i b = _mm_shuffle_epi8(pixel[0], rep); - b = _mm_unpacklo_epi16(b, pixel[1]); - __m128i sum = _mm_madd_epi16(b, weight[0]); - - sum = _mm_add_epi32(sum, pred_round); - sum = _mm_srai_epi32(sum, sm_weight_log2_scale); - - sum = _mm_shuffle_epi8(sum, gat); - *(uint32_t *)dst = _mm_cvtsi128_si32(sum); - dst += stride; - - rep = _mm_add_epi16(rep, one); - } -} - -void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_h_w4(above, left, 4, pixels); - - __m128i weights; - load_weight_h_w4(sm_weight_arrays, 4, &weights); - - smooth_h_pred_4xh(pixels, &weights, 4, dst, stride); -} - -void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - 
load_pixel_h_w4(above, left, 8, pixels); - - __m128i weights; - load_weight_h_w4(sm_weight_arrays, 8, &weights); - - smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); -} - -void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_h_w4(above, left, 16, pixels); - - __m128i weights; - load_weight_h_w4(sm_weight_arrays, 8, &weights); - - smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); - dst += stride << 3; - - pixels[0] = _mm_srli_si128(pixels[0], 8); - smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); -} - -// pixels[0]: left vector -// pixels[1]: right_pred vector -// pixels[2]: left vector + 16 -// pixels[3]: right_pred vector -static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - pixels[1] = _mm_set1_epi16((uint16_t)above[7]); - - if (height == 4) { - pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); - } else if (height == 8) { - pixels[0] = _mm_loadl_epi64((const __m128i *)left); - } else if (height == 16) { - pixels[0] = _mm_load_si128((const __m128i *)left); - } else { - pixels[0] = _mm_load_si128((const __m128i *)left); - pixels[2] = _mm_load_si128((const __m128i *)(left + 16)); - pixels[3] = pixels[1]; - } -} - -// weight_w[0]: weights_w and scale - weights_w interleave vector, first half -// weight_w[1]: weights_w and scale - weights_w interleave vector, second half -static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height, - __m128i *weight_w) { - (void)height; - const __m128i zero = _mm_setzero_si128(); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]); - const __m128i tmp1 = _mm_unpacklo_epi8(we, zero); - const __m128i tmp2 = _mm_sub_epi16(d, tmp1); - weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); - weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); -} - -static INLINE void 
smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww, - int h, uint8_t *dst, ptrdiff_t stride, - int second_half) { - const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); - const __m128i one = _mm_set1_epi16(1); - const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); - - for (int i = 0; i < h; ++i) { - __m128i b = _mm_shuffle_epi8(pixels[0], rep); - b = _mm_unpacklo_epi16(b, pixels[1]); - __m128i sum0 = _mm_madd_epi16(b, ww[0]); - __m128i sum1 = _mm_madd_epi16(b, ww[1]); - - sum0 = _mm_add_epi32(sum0, pred_round); - sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale); - - sum1 = _mm_add_epi32(sum1, pred_round); - sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale); - - sum0 = _mm_packus_epi16(sum0, sum1); - sum0 = _mm_shuffle_epi8(sum0, gat); - _mm_storel_epi64((__m128i *)dst, sum0); - dst += stride; - - rep = _mm_add_epi16(rep, one); - } -} - -void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_h_w8(above, left, 4, pixels); - - __m128i ww[2]; - load_weight_h_w8(sm_weight_arrays, 4, ww); - - smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0); -} - -void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_h_w8(above, left, 8, pixels); - - __m128i ww[2]; - load_weight_h_w8(sm_weight_arrays, 8, ww); - - smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); -} - -void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[2]; - load_pixel_h_w8(above, left, 16, pixels); - - __m128i ww[2]; - load_weight_h_w8(sm_weight_arrays, 16, ww); - - smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); - dst += stride << 3; - smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1); -} - -void 
aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[4]; - load_pixel_h_w8(above, left, 32, pixels); - - __m128i ww[2]; - load_weight_h_w8(sm_weight_arrays, 32, ww); - - smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0); - dst += stride << 3; - smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1); - dst += stride << 3; - smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0); - dst += stride << 3; - smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1); -} - -static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left, uint32_t bw, - uint32_t bh) { - const uint8_t *const sm_weights_w = sm_weight_arrays + bw; - const __m128i zero = _mm_setzero_si128(); - const __m128i scale_value = - _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]); - const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); - - for (uint32_t y = 0; y < bh; ++y) { - const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); - const __m128i tr_ly = - _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0); - - for (uint32_t x = 0; x < bw; x += 8) { - const __m128i weights_x = - _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); - const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero); - const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw); - const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw); - const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw); - __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly); - __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly); - - pred_lo = _mm_add_epi32(pred_lo, pred_round); - pred_hi = _mm_add_epi32(pred_hi, pred_round); - - pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); - pred_hi = 
_mm_srai_epi32(pred_hi, sm_weight_log2_scale); - - __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); - pred = _mm_shuffle_epi8(pred, gat); - _mm_storel_epi64((__m128i *)(dst + x), pred); - } - dst += stride; - } -} - -void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 16, 4); -} - -void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 16, 8); -} - -void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 16, 16); -} - -void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 16, 32); -} - -void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 16, 64); -} - -void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 32, 8); -} - -void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 32, 16); -} - -void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 32, 32); -} - -void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 32, 64); -} - -void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const 
uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 64, 64); -} - -void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 64, 32); -} - -void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - smooth_h_predictor_wxh(dst, stride, above, left, 64, 16); -} diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm deleted file mode 100644 index 0bc841a7a..000000000 --- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm +++ /dev/null @@ -1,107 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro REORDER_INPUTS 0 - ; a c d b to a b c d - SWAP 1, 3, 2 -%endmacro - -%macro TRANSFORM_COLS 0 - ; input: - ; m0 a - ; m1 b - ; m2 c - ; m3 d - paddw m0, m2 - psubw m3, m1 - - ; wide subtract - punpcklwd m4, m0 - punpcklwd m5, m3 - psrad m4, 16 - psrad m5, 16 - psubd m4, m5 - psrad m4, 1 - packssdw m4, m4 ; e - - psubw m5, m4, m1 ; b - psubw m4, m2 ; c - psubw m0, m5 - paddw m3, m4 - ; m0 a - SWAP 1, 5 ; m1 b - SWAP 2, 4 ; m2 c - ; m3 d -%endmacro - -%macro TRANSPOSE_4X4 0 - punpcklwd m0, m2 - punpcklwd m1, m3 - mova m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 -%macro TRANSPOSE_4X4_WIDE 0 - mova m3, m0 - punpcklwd m0, m1 - punpckhwd m3, m1 - mova m2, m0 - punpcklwd m0, m3 - punpckhwd m2, m3 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero - movd m%3, [outputq] - movd m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%1, m%3 - paddw m%2, m%4 - packuswb m%1, m%5 - packuswb m%2, m%5 - movd [outputq], m%1 - movd [outputq + strideq], m%2 -%endmacro - -INIT_XMM sse2 -cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - psraw m0, 2 - psraw m1, 2 - - TRANSPOSE_4X4_WIDE - REORDER_INPUTS - TRANSFORM_COLS - TRANSPOSE_4X4 - REORDER_INPUTS - TRANSFORM_COLS - - pxor m4, m4 - ADD_STORE_4P_2X 0, 1, 5, 6, 4 - lea outputq, [outputq + 2 * strideq] - ADD_STORE_4P_2X 2, 3, 5, 6, 4 - - RET diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c deleted file mode 100644 index c3c88245a..000000000 --- a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" -#include "config/av1_rtcd.h" - -#include "aom_dsp/x86/synonyms.h" - -unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int i; - assert(width == 4); - (void)width; - - __m128i sad = _mm_setzero_si128(); - for (i = 0; i < height; i += 4) { - __m128i x0 = xx_loadl_32(a + 0 * a_stride); - __m128i x1 = xx_loadl_32(a + 1 * a_stride); - __m128i x2 = xx_loadl_32(a + 2 * a_stride); - __m128i x3 = xx_loadl_32(a + 3 * a_stride); - __m128i x_lo = _mm_unpacklo_epi32(x0, x1); - __m128i x_hi = _mm_unpacklo_epi32(x2, x3); - - __m128i x = _mm_unpacklo_epi64(x_lo, x_hi); - - x0 = xx_loadl_32(b + 0 * b_stride); - x1 = xx_loadl_32(b + 1 * b_stride); - x2 = xx_loadl_32(b + 2 * b_stride); - x3 = xx_loadl_32(b + 3 * b_stride); - x_lo = _mm_unpacklo_epi32(x0, x1); - x_hi = _mm_unpacklo_epi32(x2, x3); - - __m128i y = _mm_unpacklo_epi64(x_lo, x_hi); - - __m128i sad4x4 = _mm_sad_epu8(x, y); - sad = _mm_add_epi32(sad, sad4x4); - - a += 4 * a_stride; - b += 4 * b_stride; - } - - // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95]. 
- const unsigned int res = - _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); - - return res; -} - -unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int i; - assert(width == 8); - (void)width; - - __m128i sad = _mm_setzero_si128(); - for (i = 0; i < height; i += 2) { - __m128i x0 = xx_loadl_64(a + 0 * a_stride); - __m128i x1 = xx_loadl_64(a + 1 * a_stride); - - __m128i x = _mm_unpacklo_epi64(x0, x1); - - x0 = xx_loadl_64(b + 0 * b_stride); - x1 = xx_loadl_64(b + 1 * b_stride); - - __m128i y = _mm_unpacklo_epi64(x0, x1); - - __m128i sad8x2 = _mm_sad_epu8(x, y); - sad = _mm_add_epi32(sad, sad8x2); - - a += 2 * a_stride; - b += 2 * b_stride; - } - - const unsigned int res = - _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); - - return res; -} - -unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int i; - assert(width == 16); - (void)width; - - __m128i sad = _mm_setzero_si128(); - for (i = 0; i < height; ++i) { - __m128i x = xx_loadu_128(a); - __m128i y = xx_loadu_128(b); - - __m128i sad16x1 = _mm_sad_epu8(x, y); - sad = _mm_add_epi32(sad, sad16x1); - - a += a_stride; - b += b_stride; - } - - const unsigned int res = - _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); - - return res; -} - -unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int i, j; - assert(width == 32); - (void)width; - - __m128i sad = _mm_setzero_si128(); - for (i = 0; i < height; ++i) { - for (j = 0; j < 2; ++j) { - __m128i x = xx_loadu_128(a + j * 16); - __m128i y = xx_loadu_128(b + j * 16); - - __m128i sad32_half = _mm_sad_epu8(x, y); - sad = _mm_add_epi32(sad, sad32_half); - } - - a += a_stride; - b += b_stride; - } - - const unsigned int res = - _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); - - return 
res; -} - -unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int i, j; - assert(width == 64); - (void)width; - - __m128i sad = _mm_setzero_si128(); - for (i = 0; i < height; ++i) { - for (j = 0; j < 4; ++j) { - __m128i x = xx_loadu_128(a + j * 16); - __m128i y = xx_loadu_128(b + j * 16); - - __m128i sad64_quarter = _mm_sad_epu8(x, y); - sad = _mm_add_epi32(sad, sad64_quarter); - } - - a += a_stride; - b += b_stride; - } - - const unsigned int res = - _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); - - return res; -} - -unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int i, j; - assert(width == 128); - (void)width; - - __m128i sad = _mm_setzero_si128(); - for (i = 0; i < height; ++i) { - for (j = 0; j < 8; ++j) { - __m128i x = xx_loadu_128(a + j * 16); - __m128i y = xx_loadu_128(b + j * 16); - - __m128i sad64_quarter = _mm_sad_epu8(x, y); - sad = _mm_add_epi32(sad, sad64_quarter); - } - - a += a_stride; - b += b_stride; - } - - const unsigned int res = - _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); - - return res; -} - -#define jnt_sadMxN_sse2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ - return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ - } - -#define jnt_sadMxN_avx2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ - 
return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ - } - -/* clang-format off */ -jnt_sadMxN_sse2(128, 128) -jnt_sadMxN_sse2(128, 64) -jnt_sadMxN_sse2(64, 128) -jnt_sadMxN_sse2(64, 64) -jnt_sadMxN_sse2(64, 32) -jnt_sadMxN_sse2(32, 64) -jnt_sadMxN_sse2(32, 32) -jnt_sadMxN_sse2(32, 16) -jnt_sadMxN_sse2(16, 32) -jnt_sadMxN_sse2(16, 16) -jnt_sadMxN_sse2(16, 8) -jnt_sadMxN_sse2(8, 16) -jnt_sadMxN_sse2(8, 8) -jnt_sadMxN_sse2(8, 4) -jnt_sadMxN_sse2(4, 8) -jnt_sadMxN_sse2(4, 4) -jnt_sadMxN_sse2(4, 16) -jnt_sadMxN_sse2(16, 4) -jnt_sadMxN_sse2(8, 32) -jnt_sadMxN_sse2(32, 8) -jnt_sadMxN_sse2(16, 64) -jnt_sadMxN_sse2(64, 16) - /* clang-format on */ diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c deleted file mode 100644 index f9a41a210..000000000 --- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" -#include "config/av1_rtcd.h" - -#include "aom_dsp/x86/synonyms.h" - -void aom_var_filter_block2d_bil_first_pass_ssse3( - const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -void aom_var_filter_block2d_bil_second_pass_ssse3( - const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, - const __m128i *w, const __m128i *r, - void *const result) { - __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); - __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); - __m128i round_lo = _mm_add_epi16(mult_lo, *r); - __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); - - __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); - __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); - __m128i round_hi = _mm_add_epi16(mult_hi, *r); - __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); - - xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); -} - -void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { - int i; - const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; - const uint8_t w1 = (uint8_t)jcp_param->bck_offset; - const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, - w1, w0, w1, w0); - const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); - const __m128i r = - _mm_set_epi16(round, round, round, round, round, round, round, round); - - if (width >= 16) { - // Read 16 pixels one row at a time - assert(!(width & 15)); - for (i = 0; i < height; ++i) { - int j; - for (j = 0; j < width; 
j += 16) { - __m128i p0 = xx_loadu_128(ref); - __m128i p1 = xx_loadu_128(pred); - - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); - - comp_pred += 16; - pred += 16; - ref += 16; - } - ref += ref_stride - width; - } - } else if (width >= 8) { - // Read 8 pixels two row at a time - assert(!(width & 7)); - assert(!(width & 1)); - for (i = 0; i < height; i += 2) { - __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); - __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); - __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); - __m128i p1 = xx_loadu_128(pred); - - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); - - comp_pred += 16; - pred += 16; - ref += 2 * ref_stride; - } - } else { - // Read 4 pixels four row at a time - assert(!(width & 3)); - assert(!(height & 3)); - for (i = 0; i < height; i += 4) { - const uint8_t *row0 = ref + 0 * ref_stride; - const uint8_t *row1 = ref + 1 * ref_stride; - const uint8_t *row2 = ref + 2 * ref_stride; - const uint8_t *row3 = ref + 3 * ref_stride; - - __m128i p0 = - _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1], - row1[2], row1[3], row2[0], row2[1], row2[2], row2[3], - row3[0], row3[1], row3[2], row3[3]); - __m128i p1 = xx_loadu_128(pred); - - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); - - comp_pred += 16; - pred += 16; - ref += 4 * ref_stride; - } - } -} - -void aom_jnt_comp_avg_upsampled_pred_ssse3( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { - int n; - int i; - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); - /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ - assert(!(width * height & 15)); - n = width * height >> 4; - - const uint8_t w0 = 
(uint8_t)jcp_param->fwd_offset; - const uint8_t w1 = (uint8_t)jcp_param->bck_offset; - const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, - w1, w0, w1, w0); - const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); - const __m128i r = - _mm_set_epi16(round, round, round, round, round, round, round, round); - - for (i = 0; i < n; i++) { - __m128i p0 = xx_loadu_128(comp_pred); - __m128i p1 = xx_loadu_128(pred); - - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); - - comp_pred += 16; - pred += 16; - } -} - -#define JNT_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_ssse3( \ - a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_ssse3( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ - jcp_param); \ - \ - return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ - } - -JNT_SUBPIX_AVG_VAR(128, 128) -JNT_SUBPIX_AVG_VAR(128, 64) -JNT_SUBPIX_AVG_VAR(64, 128) -JNT_SUBPIX_AVG_VAR(64, 64) -JNT_SUBPIX_AVG_VAR(64, 32) -JNT_SUBPIX_AVG_VAR(32, 64) -JNT_SUBPIX_AVG_VAR(32, 32) -JNT_SUBPIX_AVG_VAR(32, 16) -JNT_SUBPIX_AVG_VAR(16, 32) -JNT_SUBPIX_AVG_VAR(16, 16) -JNT_SUBPIX_AVG_VAR(16, 8) -JNT_SUBPIX_AVG_VAR(8, 16) -JNT_SUBPIX_AVG_VAR(8, 8) -JNT_SUBPIX_AVG_VAR(8, 4) -JNT_SUBPIX_AVG_VAR(4, 8) -JNT_SUBPIX_AVG_VAR(4, 4) -JNT_SUBPIX_AVG_VAR(4, 16) -JNT_SUBPIX_AVG_VAR(16, 4) -JNT_SUBPIX_AVG_VAR(8, 32) -JNT_SUBPIX_AVG_VAR(32, 8) -JNT_SUBPIX_AVG_VAR(16, 64) -JNT_SUBPIX_AVG_VAR(64, 16) diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c 
b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c deleted file mode 100644 index 9d88b5e49..000000000 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ /dev/null @@ -1,2385 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/synonyms.h" -#include "aom_ports/mem.h" -#include "aom_ports/emmintrin_compat.h" - -static INLINE __m128i abs_diff(__m128i a, __m128i b) { - return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); -} - -static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - *d0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - - *d1 = _mm_srli_si128(*d0, - 4); // 01 11 21 31 xx xx xx 
xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(*d0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(*d0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, __m128i *d4, - __m128i *d5, __m128i *d6, - __m128i *d7) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx - // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1, ww0, ww1; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - - *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - *d1 = _mm_srli_si128(ww0, - 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(ww0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(ww0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - *d5 = _mm_srli_si128(ww1, - 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - *d6 = _mm_srli_si128(ww1, - 8); // 06 16 26 36 
xx xx xx xx xx xx xx xx xx xx xx xx - *d7 = _mm_srli_si128(ww1, - 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0, - __m128i *d1, __m128i *d2, - __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - // output - // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx - // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx - // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx - // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx - - __m128i w0, w1, w2, w3, w4, w5; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d1 = _mm_srli_si128(*d0, 8); - *d2 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - *d3 = _mm_srli_si128(*d2, 8); -} - -static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0d1, - __m128i *d2d3, __m128i *d4d5, - __m128i *d6d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7; - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - w0 = 
_mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0d1 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d2d3 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - - w6 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - w7 = _mm_unpackhi_epi16( - w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - - *d4d5 = _mm_unpacklo_epi32( - w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - *d6d7 = _mm_unpackhi_epi32( - w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 -} - -static INLINE void transpose16x8_8x16_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, - __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, - __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, - __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpacklo_epi8(*x8, *x9); - w9 = _mm_unpacklo_epi8(*x10, *x11); - w10 = 
_mm_unpacklo_epi8(*x12, *x13); - w11 = _mm_unpacklo_epi8(*x14, *x15); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0 = _mm_unpacklo_epi64(w6, w14); - *d1 = _mm_unpackhi_epi64(w6, w14); - *d2 = _mm_unpacklo_epi64(w7, w15); - *d3 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d4 = _mm_unpacklo_epi64(w6, w14); - *d5 = _mm_unpackhi_epi64(w6, w14); - *d6 = _mm_unpacklo_epi64(w7, w15); - *d7 = _mm_unpackhi_epi64(w7, w15); -} - -// this function treats its input as 2 parallel 8x4 matrices, transposes each of -// them independently while flipping the second matrix horizontaly Used for 14 -// taps filter pq pairs inverse -static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, - __m128i *pq0, __m128i *pq1, - __m128i *pq2, __m128i *pq3) { - __m128i w10, w11, w12, w13; - __m128i w0, w1, w2, w3, w4, w5; - __m128i d0, d1, d2, d3; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - w2 = _mm_unpacklo_epi8( - *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - w3 = _mm_unpacklo_epi8( - *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = 
_mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - d0 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - d2 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - - w10 = _mm_unpacklo_epi8( - *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 - w11 = _mm_unpacklo_epi8( - *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 - w12 = _mm_unpacklo_epi8( - *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 - w13 = _mm_unpacklo_epi8( - *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 - - w4 = _mm_unpackhi_epi16( - w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpackhi_epi16( - w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - d1 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - d3 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - - *pq0 = _mm_unpacklo_epi64(d0, d1); // pq - *pq1 = _mm_unpackhi_epi64(d0, d1); // pq - *pq2 = _mm_unpacklo_epi64(d2, d3); // pq - *pq3 = _mm_unpackhi_epi64(d2, d3); // pq -} - -static INLINE void transpose8x16_16x8_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, - __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, - __m128i *d12d13, __m128i *d14d15) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpackhi_epi8(*x0, *x1); - w9 = _mm_unpackhi_epi8(*x2, *x3); - w10 = _mm_unpackhi_epi8(*x4, *x5); - w11 = _mm_unpackhi_epi8(*x6, *x7); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - 
w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0d1 = _mm_unpacklo_epi64(w6, w14); - *d2d3 = _mm_unpackhi_epi64(w6, w14); - *d4d5 = _mm_unpacklo_epi64(w7, w15); - *d6d7 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d8d9 = _mm_unpacklo_epi64(w6, w14); - *d10d11 = _mm_unpackhi_epi64(w6, w14); - *d12d13 = _mm_unpacklo_epi64(w7, w15); - *d14d15 = _mm_unpackhi_epi64(w7, w15); -} - -// this function treats its input as 2 parallel 8x4 matrices, transposes each of -// them to 4x8 independently while flipping the second matrix horizontaly. Used -// for 14 taps pq pairs creation -static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *q0p0, - __m128i *q1p1, __m128i *q2p2, - __m128i *q3p3, __m128i *q4p4, - __m128i *q5p5, __m128i *q6p6, - __m128i *q7p7) { - __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - w2 = _mm_unpackhi_epi8( - *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 - w3 = _mm_unpackhi_epi8( - *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 - - ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - ww2 = _mm_unpacklo_epi16( - w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 - ww3 = _mm_unpackhi_epi16( - w2, - 
w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 - - *q7p7 = _mm_unpacklo_epi32( - ww0, - _mm_srli_si128( - ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx - *q6p6 = _mm_unpackhi_epi32( - _mm_slli_si128(ww0, 4), - ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx - *q5p5 = _mm_unpackhi_epi32( - ww0, - _mm_slli_si128( - ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx - *q4p4 = _mm_unpacklo_epi32( - _mm_srli_si128(ww0, 12), - ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx - *q3p3 = _mm_unpacklo_epi32( - ww1, - _mm_srli_si128( - ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx - *q2p2 = _mm_unpackhi_epi32( - _mm_slli_si128(ww1, 4), - ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx - *q1p1 = _mm_unpackhi_epi32( - ww1, - _mm_slli_si128( - ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx - *q0p0 = _mm_unpacklo_epi32( - _mm_srli_si128(ww1, 12), - ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx -} - -static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, - __m128i *hev, __m128i *mask, - __m128i *qs1qs0, __m128i *ps1ps0) { - __m128i filter, filter2filter1, work; - __m128i ps1ps0_work, qs1qs0_work; - __m128i hev1; - const __m128i t3t4 = - _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i ff = _mm_cmpeq_epi8(t80, t80); - - ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ - qs1qs0_work = _mm_xor_si128(*q1q0, t80); - - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ - work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); - filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ - filter = _mm_subs_epi8(filter, work); - filter = _mm_subs_epi8(filter, work); - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ - filter = _mm_and_si128(filter, *mask); /* & mask */ - filter = 
_mm_unpacklo_epi32(filter, filter); - - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ - filter2filter1 = - _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); - - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ - filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit - filter = _mm_srai_epi16(filter, 9); /* round */ - filter = _mm_packs_epi16(filter, filter); - filter = _mm_andnot_si128(*hev, filter); - filter = _mm_unpacklo_epi32(filter, filter); - - filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); - hev1 = _mm_srli_si128(filter2filter1, 8); - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ - qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ - ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); - - *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ - *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ -} - -static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, - __m128i *hev, __m128i *mask, - __m128i *qs1qs0, - __m128i *ps1ps0) { - const __m128i t3t4 = - _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); - const __m128i t80 = _mm_set1_epi8(0x80); - __m128i filter, filter2filter1, work; - __m128i ps1ps0_work, qs1qs0_work; - __m128i hev1; - const __m128i ff = _mm_cmpeq_epi8(t80, t80); - - ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ - qs1qs0_work = _mm_xor_si128(*q1q0, t80); - - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ - work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); - filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev); - /* filter = 
signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ - filter = _mm_subs_epi8(filter, work); - filter = _mm_subs_epi8(filter, work); - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ - filter = _mm_and_si128(filter, *mask); /* & mask */ - filter = _mm_unpacklo_epi64(filter, filter); - - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ - filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); - - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ - filter = _mm_unpacklo_epi8(filter, filter); - filter = _mm_srai_epi16(filter, 9); /* round */ - filter = _mm_packs_epi16(filter, filter); - filter = _mm_andnot_si128(*hev, filter); - - hev1 = _mm_unpackhi_epi64(filter2filter1, filter); - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); - - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ - qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ - ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); - *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ - *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ -} - -static AOM_FORCE_INLINE void lpf_internal_4_sse2( - __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, - __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { - __m128i q1p1, q0p0, p1p0, q1q0; - __m128i abs_p0q0, abs_p1q1; - __m128i mask, flat, hev; - const __m128i zero = _mm_setzero_si128(); - - q1p1 = _mm_unpacklo_epi32(*p1, *q1); - q0p0 = _mm_unpacklo_epi32(*p0, *q0); - - p1p0 = 
_mm_unpacklo_epi32(q0p0, q1p1); - q1q0 = _mm_srli_si128(p1p0, 8); - - /* (abs(q1 - q0), abs(p1 - p0) */ - flat = abs_diff(q1p1, q0p0); - /* abs(p1 - q1), abs(p0 - q0) */ - __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); - - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); - hev = _mm_unpacklo_epi8(flat, zero); - - hev = _mm_cmpgt_epi16(hev, *thresh); - hev = _mm_packs_epi16(hev, hev); - hev = _mm_unpacklo_epi32(hev, hev); - - abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ - abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ - abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ - - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); - mask = _mm_unpacklo_epi32(mask, flat); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); - - filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); -} - -static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( - __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, - __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { - __m128i q1p1, q0p0, p1p0, q1q0; - __m128i abs_p0q0, abs_p1q1; - __m128i mask, hev; - const __m128i zero = _mm_setzero_si128(); - - q1p1 = _mm_unpacklo_epi64(*p1, *q1); - q0p0 = _mm_unpacklo_epi64(*p0, *q0); - - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); - - /* (abs(q1 - q0), abs(p1 - p0) */ - __m128i flat = abs_diff(q1p1, q0p0); - /* abs(p1 - q1), abs(p0 - q0) */ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); - - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - hev = _mm_unpacklo_epi8(flat, zero); - - hev = _mm_cmpgt_epi16(hev, *thresh); - hev = 
_mm_packs_epi16(hev, hev); - - /* const int8_t mask = filter_mask2(*limit, *blimit, */ - /* p1, p0, q0, q1); */ - abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ - abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); - mask = _mm_unpacklo_epi64(mask, flat); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); - - filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); -} - -void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - - __m128i qs1qs0, ps1ps0; - __m128i p1, p0, q0, q1; - - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); - - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); - - xx_storel_32(s - 1 * p, ps1ps0); - xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); - xx_storel_32(s + 0 * p, qs1qs0); - xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); -} - -void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - __m128i p1p0, q1q0; - __m128i p1, p0, q0, q1; - - const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - __m128i thresh 
= - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - - __m128i x0, x1, x2, x3; - __m128i d0, d1, d2, d3; - x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); - x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); - x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); - - transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); - - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); - - // Transpose 8x4 to 4x8 - p1 = _mm_srli_si128(p1p0, 4); - q1 = _mm_srli_si128(q1q0, 4); - - transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); - - xx_storel_32(s + 0 * p - 2, d0); - xx_storel_32(s + 1 * p - 2, d1); - xx_storel_32(s + 2 * p - 2, d2); - xx_storel_32(s + 3 * p - 2, d3); -} - -static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { - xx_storel_32(s - (num + 1) * p, x); - xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); -} - -static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( - __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, - __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, - __m128i *thresh) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi8(1); - __m128i mask, hev, flat, flat2; - __m128i qs0ps0, qs1ps1; - __m128i p1p0, q1q0, qs1qs0, ps1ps0; - __m128i abs_p1p0; - - p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); - q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0; - __m128i fe, ff, work; - abs_p1p0 = abs_diff(*q1p1, *q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = abs_diff(p1p0, q1q0); - abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); - abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, *thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - // replicate for the further 
"merged variables" usage - hev = _mm_unpacklo_epi64(hev, hev); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - the same for 6, 8 and 14 versions - filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); - qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); - qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); - // loopfilter done - - __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - __m128i work; - flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - // if flat ==0 then flat2 is zero as well and we don't need any calc below - // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p6, sum_q6; - __m128i 
sum_p3, sum_q3, res_p, res_q; - - p6_16 = _mm_unpacklo_epi8(*q6p6, zero); - p5_16 = _mm_unpacklo_epi8(*q5p5, zero); - p4_16 = _mm_unpacklo_epi8(*q4p4, zero); - p3_16 = _mm_unpacklo_epi8(*q3p3, zero); - p2_16 = _mm_unpacklo_epi8(*q2p2, zero); - p1_16 = _mm_unpacklo_epi8(*q1p1, zero); - p0_16 = _mm_unpacklo_epi8(*q0p0, zero); - q0_16 = _mm_unpackhi_epi8(*q0p0, zero); - q1_16 = _mm_unpackhi_epi8(*q1p1, zero); - q2_16 = _mm_unpackhi_epi8(*q2p2, zero); - q3_16 = _mm_unpackhi_epi8(*q3p3, zero); - q4_16 = _mm_unpackhi_epi8(*q4p4, zero); - q5_16 = _mm_unpackhi_epi8(*q5p5, zero); - q6_16 = _mm_unpackhi_epi8(*q6p6, zero); - pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, - _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), - _mm_add_epi16(p1_16, q0_16))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, - _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), - _mm_add_epi16(p0_16, q1_16))), - 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(p6_16, p6_16); - sum_q6 = _mm_add_epi16(q6_16, q6_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = 
_mm_sub_epi16(pixelFilter_p, p5_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), - 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - // work with flat2 - flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - work = abs_diff(*q6p6, *q0p0); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - - // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - flat = _mm_unpacklo_epi64(flat, flat); - *q2p2 = _mm_andnot_si128(flat, *q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - 
qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - - 
res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - flat2 = _mm_unpacklo_epi64(flat2, flat2); - - *q5p5 = _mm_andnot_si128(flat2, *q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); - - *q4p4 = _mm_andnot_si128(flat2, *q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); - - *q3p3 = _mm_andnot_si128(flat2, *q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); - - *q2p2 = _mm_andnot_si128(flat2, *q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); - - *q1p1 = _mm_andnot_si128(flat2, *q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); - - *q0p0 = _mm_andnot_si128(flat2, *q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); - } - } else { - *q0p0 = qs0ps0; - *q1p1 = qs1ps1; - } -} - -static AOM_FORCE_INLINE void lpf_internal_14_sse2( - __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, - __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, - __m128i *thresh) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi8(1); - __m128i mask, hev, flat, flat2; - __m128i flat2_pq[6], flat_pq[3]; - __m128i qs0ps0, qs1ps1; - __m128i p1p0, q1q0, qs1qs0, ps1ps0; - __m128i abs_p1p0; - - p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); - q1q0 = _mm_srli_si128(p1p0, 8); - - __m128i fe, ff, work; - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0; - abs_p1p0 = abs_diff(*q1p1, *q0p0); - abs_q1q0 = 
_mm_srli_si128(abs_p1p0, 4); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(fe, fe); - abs_p0q0 = abs_diff(p1p0, q1q0); - abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - - hev = _mm_subs_epu8(flat, *thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - // replicate for the further "merged variables" usage - hev = _mm_unpacklo_epi32(hev, hev); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); - mask = _mm_unpacklo_epi32(mask, zero); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); - qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); - qs1ps1 = _mm_srli_si128(qs0ps0, 8); - // loopfilter done - - flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - flat = _mm_unpacklo_epi32(flat, flat); - flat = _mm_unpacklo_epi64(flat, flat); - - // if flat ==0 then flat2 is zero as well and we don't need any calc below - // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - __m128i q5_16, q4_16, q3_16, 
q2_16, q1_16, q0_16; - __m128i pq_16[7]; - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i sum_p6; - __m128i sum_p3; - - pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); - pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); - pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); - pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); - pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); - pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); - pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); - q0_16 = _mm_srli_si128(pq_16[0], 8); - q1_16 = _mm_srli_si128(pq_16[1], 8); - q2_16 = _mm_srli_si128(pq_16[2], 8); - q3_16 = _mm_srli_si128(pq_16[3], 8); - q4_16 = _mm_srli_si128(pq_16[4], 8); - q5_16 = _mm_srli_si128(pq_16[5], 8); - - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[6], flat2_q[6]; - - __m128i work0, work0_0, work0_1, sum_p_0; - __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); - __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); - sum_p = _mm_add_epi16(sum_p, sum_lp); - - __m128i sum_lq = _mm_srli_si128(sum_lp, 8); - __m128i sum_q = _mm_srli_si128(sum_p, 8); - - sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); - sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - - flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); - flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); - - sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); - sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); - - sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); - sum_p = _mm_sub_epi16(sum_p_0, q5_16); - - work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); - work0_1 = _mm_add_epi16( - sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); - - sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); - sum_lp = _mm_sub_epi16(sum_lp, q2_16); - - work0 = _mm_add_epi16(sum_p3, pq_16[1]); - flat_p[1] = _mm_add_epi16(sum_lp, work0); - flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); - - flat_pq[0] = 
_mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); - flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); - flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); - flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); - - sum_lp = _mm_sub_epi16(sum_lp, q1_16); - sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); - - sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); - work0 = _mm_add_epi16(sum_p3, pq_16[2]); - - flat_p[2] = _mm_add_epi16(sum_lp, work0); - flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); - flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); - flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - - work = abs_diff(*q6p6, *q0p0); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - flat2 = _mm_unpacklo_epi32(flat2, flat2); - - // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); - *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); - *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); - - *q2p2 = _mm_andnot_si128(flat, *q2p2); - flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); - *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); - - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { - flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); - flat2_q[0] = _mm_add_epi16( - sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); - - flat2_p[1] = _mm_add_epi16(sum_p, work0_1); - flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); - - flat2_pq[0] = - 
_mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); - flat2_pq[1] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); - flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); - flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); - - sum_p = _mm_sub_epi16(sum_p, q4_16); - sum_q = _mm_sub_epi16(sum_q, pq_16[4]); - - sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); - work0 = _mm_add_epi16( - sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); - flat2_p[2] = _mm_add_epi16(sum_p, work0); - flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[2] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); - flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); - - sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); - sum_p = _mm_sub_epi16(sum_p, q3_16); - sum_q = _mm_sub_epi16(sum_q, pq_16[3]); - - work0 = _mm_add_epi16( - sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); - flat2_p[3] = _mm_add_epi16(sum_p, work0); - flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[3] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); - flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); - - sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); - sum_p = _mm_sub_epi16(sum_p, q2_16); - sum_q = _mm_sub_epi16(sum_q, pq_16[2]); - - work0 = _mm_add_epi16( - sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); - flat2_p[4] = _mm_add_epi16(sum_p, work0); - flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[4] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); - flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); - - sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); - sum_p = _mm_sub_epi16(sum_p, q1_16); - sum_q = _mm_sub_epi16(sum_q, pq_16[1]); - - work0 = _mm_add_epi16( - sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); - flat2_p[5] = _mm_add_epi16(sum_p, work0); - flat2_q[5] = _mm_add_epi16(sum_q, 
_mm_srli_si128(work0, 8)); - flat2_pq[5] = - _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); - flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - *q0p0 = _mm_andnot_si128(flat2, *q0p0); - flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); - *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); - - *q1p1 = _mm_andnot_si128(flat2, *q1p1); - flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); - *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); - - *q2p2 = _mm_andnot_si128(flat2, *q2p2); - flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); - *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); - - *q3p3 = _mm_andnot_si128(flat2, *q3p3); - flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); - *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); - - *q4p4 = _mm_andnot_si128(flat2, *q4p4); - flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); - *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); - - *q5p5 = _mm_andnot_si128(flat2, *q5p5); - flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); - *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); - } - } else { - *q0p0 = qs0ps0; - *q1p1 = qs1ps1; - } -} - -void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; - __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - __m128i limit = _mm_load_si128((const __m128i *)_limit); - __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - - q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), - _mm_cvtsi32_si128(*(int *)(s + 4 * p))); - q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), - _mm_cvtsi32_si128(*(int *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), - _mm_cvtsi32_si128(*(int *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), - _mm_cvtsi32_si128(*(int *)(s + 1 * p))); - - q0p0 = 
_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), - _mm_cvtsi32_si128(*(int *)(s - 0 * p))); - - q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), - _mm_cvtsi32_si128(*(int *)(s + 5 * p))); - - q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), - _mm_cvtsi32_si128(*(int *)(s + 6 * p))); - - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); - - store_buffer_horz_8(q0p0, p, 0, s); - store_buffer_horz_8(q1p1, p, 1, s); - store_buffer_horz_8(q2p2, p, 2, s); - store_buffer_horz_8(q3p3, p, 3, s); - store_buffer_horz_8(q4p4, p, 4, s); - store_buffer_horz_8(q5p5, p, 5, s); -} - -static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( - __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, - __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, - __m128i *thresh) { - const __m128i zero = _mm_setzero_si128(); - __m128i mask, hev, flat; - __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; - __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; - __m128i ps1ps0, qs1qs0; - - q2p2 = _mm_unpacklo_epi64(*p2, *q2); - q1p1 = _mm_unpacklo_epi64(*p1, *q1); - q0p0 = _mm_unpacklo_epi64(*p0, *q0); - - *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); - - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(fe, fe); - - { - // filter_mask and hev_mask - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - - abs_p0q0 = abs_diff(*p1p0, *q1q0); - abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); - abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); - - // considering sse doesn't have unsigned elements comparison the idea is - // to find at least one case when X > limit, it means the corresponding - // mask bit is set. 
- // to achieve that we find global max value of all inputs of abs(x-y) or - // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set - // otherwise - not - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, *thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - // replicate for the further "merged variables" usage - hev = _mm_unpacklo_epi64(hev, hev); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = abs_diff(q2p2, q1p1); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // lp filter - the same for 6, 8 and 14 versions - filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); - - // flat_mask - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - // replicate for the further "merged variables" usage - flat = _mm_unpacklo_epi64(flat, flat); - } - - // 5 tap filter - // need it only if flat !=0 - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { - const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft0, workp_shft1; - p2_16 = _mm_unpacklo_epi8(*p2, zero); - p1_16 = _mm_unpacklo_epi8(*p1, zero); - p0_16 = _mm_unpacklo_epi8(*p0, zero); - q0_16 = _mm_unpacklo_epi8(*q0, zero); - q1_16 = _mm_unpacklo_epi8(*q1, zero); - q2_16 = _mm_unpacklo_epi8(*q2, zero); - - // op1 - workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), - 
_mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), - p2_16); // p2 + p0 * 2 + p1 * 2 + 4 - - workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); - workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), - 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 - - // op0 - workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 - workp_a = _mm_add_epi16(workp_a, - workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 - workp_shft1 = _mm_srli_epi16(workp_a, 3); - - flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); - - // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), - p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 - workp_b = _mm_add_epi16(q1_16, q2_16); - workp_a = _mm_add_epi16( - workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 - workp_shft0 = _mm_srli_epi16(workp_a, 3); - - // oq1 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), - p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 - workp_b = _mm_add_epi16(q2_16, q2_16); - workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), - 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 - - flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); - - qs1qs0 = _mm_andnot_si128(flat, *q1q0); - *q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0 = _mm_or_si128(qs1qs0, *q1q0); - - ps1ps0 = _mm_andnot_si128(flat, *p1p0); - *p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0 = _mm_or_si128(ps1ps0, *p1p0); - } -} - -static AOM_FORCE_INLINE void lpf_internal_6_sse2( - __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, - __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, - __m128i *thresh) { - const __m128i zero = _mm_setzero_si128(); - __m128i mask, hev, flat; - __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; - __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; - __m128i ps1ps0, qs1qs0; - - q2p2 = _mm_unpacklo_epi32(*p2, *q2); - q1p1 = _mm_unpacklo_epi32(*p1, *q1); - q0p0 = _mm_unpacklo_epi32(*p0, *q0); - - 
*p1p0 = _mm_unpacklo_epi32(*p0, *p1); - *q1q0 = _mm_unpacklo_epi32(*q0, *q1); - - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(fe, fe); - { - // filter_mask and hev_mask - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); - - abs_p0q0 = abs_diff(*p1p0, *q1q0); - abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); - - // considering sse doesn't have unsigned elements comparison the idea is - // to find at least one case when X > limit, it means the corresponding - // mask bit is set. - // to achieve that we find global max value of all inputs of abs(x-y) or - // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set - // otherwise - not - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, *thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - // replicate for the further "merged variables" usage - hev = _mm_unpacklo_epi32(hev, hev); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); - mask = _mm_unpacklo_epi32(mask, zero); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = abs_diff(q2p2, q1p1); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); - - // flat_mask - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - 
flat = _mm_and_si128(flat, mask); - // replicate for the further "merged variables" usage - flat = _mm_unpacklo_epi32(flat, flat); - flat = _mm_unpacklo_epi64(flat, flat); - } - - // 5 tap filter - // need it only if flat !=0 - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { - const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_c; - __m128i pq0x2_pq1, pq1_pq2; - pq2_16 = _mm_unpacklo_epi8(q2p2, zero); - pq1_16 = _mm_unpacklo_epi8(q1p1, zero); - pq0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_srli_si128(pq0_16, 8); - q2_16 = _mm_srli_si128(pq2_16, 8); - - // op1 - pq0x2_pq1 = - _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 - pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 - workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), - pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 - - workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); - workp_b = - _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 - - // op0 - workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 - workp_a = _mm_add_epi16(workp_a, - workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 - workp_b = _mm_unpacklo_epi64(workp_a, workp_b); - workp_b = _mm_srli_epi16(workp_b, 3); - - flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); - - // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), - pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 - workp_b = _mm_srli_si128(pq1_pq2, 8); - workp_a = _mm_add_epi16( - workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 - // workp_shft0 = _mm_srli_epi16(workp_a, 3); - - // oq1 - workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), - pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 - workp_b = _mm_add_epi16(q2_16, q2_16); - workp_b = - _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 - - workp_a = _mm_unpacklo_epi64(workp_a, workp_b); - workp_a = _mm_srli_epi16(workp_a, 3); - - flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); - - qs1qs0 = 
_mm_andnot_si128(flat, *q1q0); - *q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0 = _mm_or_si128(qs1qs0, *q1q0); - - ps1ps0 = _mm_andnot_si128(flat, *p1p0); - *p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0 = _mm_or_si128(ps1ps0, *p1p0); - } -} - -void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i p2, p1, p0, q0, q1, q2; - __m128i p1p0, q1q0; - __m128i blimit = _mm_load_si128((__m128i *)_blimit); - __m128i limit = _mm_load_si128((__m128i *)_limit); - __m128i thresh = _mm_load_si128((__m128i *)_thresh); - - p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); - q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); - - lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, - &limit, &thresh); - - xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); - xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); -} - -void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { - __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), - _mm_load_si128((__m128i *)_blimit1)); - __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), - _mm_load_si128((__m128i *)_limit1)); - __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), - _mm_load_si128((__m128i *)_thresh1)); - - __m128i p2, p1, p0, q0, q1, q2; - __m128i p1p0, q1q0; - - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = 
_mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - - lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, - &limit, &thresh); - - _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); - _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); - _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); -} - -static AOM_FORCE_INLINE void lpf_internal_8_sse2( - __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, - __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, - __m128i *blimit, __m128i *limit, __m128i *thresh) { - const __m128i zero = _mm_setzero_si128(); - __m128i mask, hev, flat; - __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, - flat_p1p0, flat_q0q1; - __m128i q2p2, q1p1, q0p0; - __m128i q1q0, p1p0, ps1ps0, qs1qs0; - __m128i work_pq, opq2, pq2; - - q3p3 = _mm_unpacklo_epi32(*p3, *q3); - q2p2 = _mm_unpacklo_epi32(*p2, *q2); - q1p1 = _mm_unpacklo_epi32(*p1, *q1); - q0p0 = _mm_unpacklo_epi32(*p0, *q0); - - p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 - q1q0 = _mm_srli_si128(p1p0, 8); - - // filter_mask and hev_mask - - // considering sse doesn't have unsigned elements comparison the idea is to - // find at least one case when X > limit, it means the corresponding mask - // bit is set. 
- // to achieve that we find global max value of all inputs of abs(x-y) or - // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set - // otherwise - not - - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(fe, fe); - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); - - abs_p0q0 = abs_diff(p1p0, q1q0); - abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, *thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - // replicate for the further "merged variables" usage - hev = _mm_unpacklo_epi32(hev, hev); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); - mask = _mm_unpacklo_epi32(mask, zero); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); - - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); - - // flat_mask4 - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - // replicate for the further "merged variables" usage - flat = _mm_unpacklo_epi32(flat, flat); - flat = _mm_unpacklo_epi64(flat, flat); - - // 
filter8 need it only if flat !=0 - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { - const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; - p2_16 = _mm_unpacklo_epi8(*p2, zero); - p1_16 = _mm_unpacklo_epi8(*p1, zero); - p0_16 = _mm_unpacklo_epi8(*p0, zero); - q0_16 = _mm_unpacklo_epi8(*q0, zero); - q1_16 = _mm_unpacklo_epi8(*q1, zero); - q2_16 = _mm_unpacklo_epi8(*q2, zero); - p3_16 = _mm_unpacklo_epi8(*p3, zero); - q3_16 = _mm_unpacklo_epi8(*q3, zero); - - // op2 - workp_a = - _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); - workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); - workp_shft2 = _mm_add_epi16(workp_a, workp_b); - - // op1 - workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); - workp_c = _mm_add_epi16(workp_a, workp_b); - // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // op0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); - workp_d = _mm_add_epi16(workp_a, workp_b); - // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - workp_c = _mm_unpacklo_epi64(workp_d, workp_c); - workp_c = _mm_srli_epi16(workp_c, 3); - flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); - - // oq0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); - // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - workp_c = _mm_add_epi16(workp_a, workp_b); - - // oq1 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); - workp_d = _mm_add_epi16(workp_a, workp_b); - // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - workp_c = _mm_unpacklo_epi64(workp_c, workp_d); - workp_c = _mm_srli_epi16(workp_c, 
3); - flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); - - // oq2 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); - workp_shft1 = _mm_add_epi16(workp_a, workp_b); - - workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); - workp_c = _mm_srli_epi16(workp_c, 3); - - opq2 = _mm_packus_epi16(workp_c, workp_c); - - work_pq = _mm_andnot_si128(flat, q2p2); - pq2 = _mm_and_si128(flat, opq2); - *p2 = _mm_or_si128(work_pq, pq2); - *q2 = _mm_srli_si128(*p2, 4); - - qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - } -} - -static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( - __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, - __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, - __m128i *blimit, __m128i *limit, __m128i *thresh) { - const __m128i zero = _mm_setzero_si128(); - __m128i mask, hev, flat; - __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, - flat_p1p0, flat_q0q1; - __m128i q2p2, q1p1, q0p0; - __m128i q1q0, p1p0, ps1ps0, qs1qs0; - __m128i work_pq, opq2, pq2; - - q3p3 = _mm_unpacklo_epi64(*p3, *q3); - q2p2 = _mm_unpacklo_epi64(*p2, *q2); - q1p1 = _mm_unpacklo_epi64(*p1, *q1); - q0p0 = _mm_unpacklo_epi64(*p0, *q0); - - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); - - { - // filter_mask and hev_mask - - // considering sse doesn't have unsigned elements comparison the idea is to - // find at least one case when X > limit, it means the corresponding mask - // bit is set. 
- // to achieve that we find global max value of all inputs of abs(x-y) or - // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set - // otherwise - not - - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(fe, fe); - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - - abs_p0q0 = abs_diff(p1p0, q1q0); - abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); - abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, *thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - // replicate for the further "merged variables" usage - hev = _mm_unpacklo_epi64(hev, hev); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); - - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, *limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // lp filter - the same for 6, 8 and 14 versions - filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); - - // flat_mask4 - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - // replicate for the further "merged variables" usage - flat = _mm_unpacklo_epi64(flat, flat); - } - - // filter8 need it only if 
flat !=0 - if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { - const __m128i four = _mm_set1_epi16(4); - - __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; - p2_16 = _mm_unpacklo_epi8(*p2, zero); - p1_16 = _mm_unpacklo_epi8(*p1, zero); - p0_16 = _mm_unpacklo_epi8(*p0, zero); - q0_16 = _mm_unpacklo_epi8(*q0, zero); - q1_16 = _mm_unpacklo_epi8(*q1, zero); - q2_16 = _mm_unpacklo_epi8(*q2, zero); - p3_16 = _mm_unpacklo_epi8(*p3, zero); - q3_16 = _mm_unpacklo_epi8(*q3, zero); - - // op2 - workp_a = - _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); - workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); - workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // op1 - workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); - workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // op0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); - workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); - - // oq0 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); - workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - // oq1 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); - workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); - - // oq2 - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); - workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - - opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); - - work_pq = _mm_andnot_si128(flat, q2p2); - 
pq2 = _mm_and_si128(flat, opq2); - *p2 = _mm_or_si128(work_pq, pq2); - *q2 = _mm_srli_si128(*p2, 8); - - qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - } -} - -void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0; - __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - __m128i limit = _mm_load_si128((const __m128i *)_limit); - __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - - p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p)); - p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); - q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); - q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); - - lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &blimit, &limit, &thresh); - - xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); - xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); - xx_storel_32(s - 3 * p, p2); - xx_storel_32(s + 2 * p, q2); -} - -void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { - __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; - __m128i blimit = - _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const 
__m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - __m128i thresh = - _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - - q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), - _mm_loadl_epi64((__m128i *)(s + 4 * p))); - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); - - q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), - _mm_loadl_epi64((__m128i *)(s + 5 * p))); - - q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), - _mm_loadl_epi64((__m128i *)(s + 6 * p))); - - lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, - &blimit, &limit, &thresh); - - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); -} - -void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t 
*_limit1, - const uint8_t *_thresh1) { - __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), - _mm_load_si128((__m128i *)_blimit1)); - __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), - _mm_load_si128((__m128i *)_limit1)); - __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), - _mm_load_si128((__m128i *)_thresh1)); - - __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0; - - p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - - lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &blimit, &limit, &thresh); - - _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); - _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); - _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); -} - -void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { - __m128i p1, p0, q0, q1; - __m128i qs1qs0, ps1ps0; - - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - - const __m128i zero = _mm_setzero_si128(); - const __m128i blimit = - _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - 
_mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - - __m128i l = _mm_unpacklo_epi64(blimit, limit); - - __m128i thresh0 = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); - - __m128i thresh1 = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); - - __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - - lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); - - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); - _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); - _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); -} - -void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - __m128i p0, q0, q1, p1; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i qs1qs0, ps1ps0; - - const __m128i zero = _mm_setzero_si128(); - const __m128i blimit = - _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - - __m128i l = _mm_unpacklo_epi64(blimit, limit); - - __m128i thresh0 = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); - - __m128i thresh1 = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); - - __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - - x0 = _mm_loadl_epi64((__m128i *)((s - 2))); - x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); - x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); - x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); - x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); - x6 
= _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); - x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); - - transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, - &q1); - - lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); - - p1 = _mm_srli_si128(ps1ps0, 8); - q1 = _mm_srli_si128(qs1qs0, 8); - - transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, - &d5, &d6, &d7); - - xx_storel_32((s - 2 + 0 * p), d0); - xx_storel_32((s - 2 + 1 * p), d1); - xx_storel_32((s - 2 + 2 * p), d2); - xx_storel_32((s - 2 + 3 * p), d3); - xx_storel_32((s - 2 + 4 * p), d4); - xx_storel_32((s - 2 + 5 * p), d5); - xx_storel_32((s - 2 + 6 * p), d6); - xx_storel_32((s - 2 + 7 * p), d7); -} - -void aom_lpf_vertical_6_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i x2, x1, x0, x3; - __m128i p0, q0; - __m128i p1p0, q1q0; - __m128i blimit = _mm_load_si128((__m128i *)_blimit); - __m128i limit = _mm_load_si128((__m128i *)_limit); - __m128i thresh = _mm_load_si128((__m128i *)_thresh); - - x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); - x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); - x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); - x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); - - transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, - &d7); - - lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, - &limit, &thresh); - - p0 = _mm_srli_si128(p1p0, 4); - q0 = _mm_srli_si128(q1q0, 4); - - transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); - - xx_storel_32(s + 0 * p - 2, d0); - xx_storel_32(s + 1 * p - 2, d1); - xx_storel_32(s + 2 * p - 2, d2); - xx_storel_32(s + 3 * p - 2, d3); -} - -void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const 
uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), - _mm_load_si128((__m128i *)_blimit1)); - __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), - _mm_load_si128((__m128i *)_limit1)); - __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), - _mm_load_si128((__m128i *)_thresh1)); - - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i p0, q0; - __m128i p1p0, q1q0; - __m128i d0d1, d2d3, d4d5, d6d7; - - x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); - x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); - x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); - x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); - x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); - x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); - x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); - - transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, - &d6d7); - - d1 = _mm_srli_si128(d0d1, 8); - d3 = _mm_srli_si128(d2d3, 8); - d5 = _mm_srli_si128(d4d5, 8); - d7 = _mm_srli_si128(d6d7, 8); - - lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, - &blimit, &limit, &thresh); - - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); - - transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, - &d6, &d7); - - xx_storel_32((s - 2 + 0 * p), d0); - xx_storel_32((s - 2 + 1 * p), d1); - xx_storel_32((s - 2 + 2 * p), d2); - xx_storel_32((s - 2 + 3 * p), d3); - xx_storel_32((s - 2 + 4 * p), d4); - xx_storel_32((s - 2 + 5 * p), d5); - xx_storel_32((s - 2 + 6 * p), d6); - xx_storel_32((s - 2 + 7 * p), d7); -} - -void aom_lpf_vertical_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i d0, d1, d2, d3, d4, d5, d6, d7; - 
- __m128i p0, q0; - __m128i x2, x1, x0, x3; - __m128i q1q0, p1p0; - __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - __m128i limit = _mm_load_si128((const __m128i *)_limit); - __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - - x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); - x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); - x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); - x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p)); - - transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, - &d7); - // Loop filtering - lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, - &blimit, &limit, &thresh); - - p0 = _mm_srli_si128(p1p0, 4); - q0 = _mm_srli_si128(q1q0, 4); - - transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, - &d2, &d3); - - _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); - _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); - _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); - _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); -} - -void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), - _mm_load_si128((__m128i *)_blimit1)); - __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), - _mm_load_si128((__m128i *)_limit1)); - __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), - _mm_load_si128((__m128i *)_thresh1)); - - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i d1, d3, d5, d7; - __m128i q1q0, p1p0; - __m128i p1, q1; - __m128i d0d1, d2d3, d4d5, d6d7; - - x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); - x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); - x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); - x4 = _mm_loadl_epi64((__m128i *)(s 
- 4 + 4 * p)); - x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); - x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); - x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); - - transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, - &d6d7); - - d1 = _mm_srli_si128(d0d1, 8); - d3 = _mm_srli_si128(d2d3, 8); - d5 = _mm_srli_si128(d4d5, 8); - d7 = _mm_srli_si128(d6d7, 8); - - lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, - &q1q0, &p1p0, &blimit, &limit, &thresh); - - p1 = _mm_srli_si128(p1p0, 8); - q1 = _mm_srli_si128(q1q0, 8); - - transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, - &d2d3, &d4d5, &d6d7); - - _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); - _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); - _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); - _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8)); - _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); - _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); - _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); - _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); -} - -void aom_lpf_vertical_14_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; - __m128i x6, x5, x4, x3; - __m128i pq0, pq1, pq2, pq3; - __m128i blimit = _mm_load_si128((__m128i *)_blimit); - __m128i limit = _mm_load_si128((__m128i *)_limit); - __m128i thresh = _mm_load_si128((__m128i *)_thresh); - - x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); - x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); - x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); - x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); - - transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, - &q5p5, &q6p6, &q7p7); - - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, 
&q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); - - transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, - &q0p0, &pq0, &pq1, &pq2, &pq3); - _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); - _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); - _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); - _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); -} - -void aom_lpf_vertical_14_dual_sse2( - unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1) { - __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; - __m128i x7, x6, x5, x4, x3, x2, x1, x0; - __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; - __m128i q0, q1, q2, q3, q7; - __m128i p0p1, p2p3, p4p5, p6p7; - - __m128i blimit = - _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - __m128i thresh = - _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - - x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); - x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); - x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); - x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); - x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); - x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p)); - x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p)); - x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p)); - - transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3, - &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15); - - q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8)); - q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8)); - q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8)); - q3p3 = _mm_unpacklo_epi64(d8d9, 
_mm_srli_si128(d6d7, 8)); - q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8)); - q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8)); - q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); - q7 = _mm_srli_si128(d14d15, 8); - - lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, - &blimit, &limit, &thresh); - - x0 = _mm_srli_si128(q0p0, 8); - x1 = _mm_srli_si128(q1p1, 8); - x2 = _mm_srli_si128(q2p2, 8); - x3 = _mm_srli_si128(q3p3, 8); - x4 = _mm_srli_si128(q4p4, 8); - x5 = _mm_srli_si128(q5p5, 8); - x6 = _mm_srli_si128(q6p6, 8); - - transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, - &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1, - &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3); - - _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1); - _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3); - _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5); - _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7); - _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0); - _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1); - _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2); - _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); -} diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h deleted file mode 100644 index 8970fe7dd..000000000 --- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ -#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_config.h" - -static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *x4, __m128i *x5, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, - __m128i *d4, __m128i *d5) { - __m128i w0, w1, w2, w3, w4, w5, ww0; - - // 00 01 02 03 04 05 xx xx - // 10 11 12 13 14 15 xx xx - // 20 21 22 23 24 25 xx xx - // 30 31 32 33 34 35 xx xx - // 40 41 42 43 44 45 xx xx - // 50 51 52 53 54 55 xx xx - - w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 - w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 - w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 - - ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 - *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 - *d1 = _mm_unpackhi_epi64(ww0, - _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx - - ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 - *d2 = _mm_unpacklo_epi64(ww0, - _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx - - w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx - w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx - w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx - - *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 - - ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 - *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 - *d5 = _mm_unpackhi_epi64(ww0, - _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx -} - -static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3) { - __m128i zero = _mm_setzero_si128(); - __m128i w0, w1, ww0, ww1; - - w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 - w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 
23 33 - - ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 - ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 - - *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx - *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx - *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx - *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx -} - -static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *d4, __m128i *d5, - __m128i *d6, __m128i *d7) { - __m128i w0, w1, ww2, ww3; - __m128i zero = _mm_setzero_si128(); - - w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 - w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 - - ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 - ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 - - *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx - *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx - *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx - *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx -} - -// here in and out pointers (x and d) should be different! 
we don't store their -// values inside -static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, - __m128i *d4, __m128i *d5, - __m128i *d6, __m128i *d7) { - // input - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - // output - // 00 10 20 30 xx xx xx xx - // 01 11 21 31 xx xx xx xx - // 02 12 22 32 xx xx xx xx - // 03 13 23 33 xx xx xx xx - // 04 14 24 34 xx xx xx xx - // 05 15 25 35 xx xx xx xx - // 06 16 26 36 xx xx xx xx - // 07 17 27 37 xx xx xx xx - highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); - highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); -} - -static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3) { - __m128i w0, w1, w2, w3, ww0, ww1; - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - - w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 - w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 - w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 - w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 - - ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 - ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 - - *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 - *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 - - ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 - ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 - - *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 
72 - *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 -} - -static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, - __m128i *d4, __m128i *d5, - __m128i *d6, __m128i *d7) { - __m128i w0, w1, w2, w3, ww0, ww1; - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 - w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 - w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 - w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 - - ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 - ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 - - *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 - *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 - - ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 - ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 - - *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 - *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 -} - -// here in and out pointers (x and d) should be different! we don't store their -// values inside -static INLINE void highbd_transpose8x8_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, - __m128i *d7) { - highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); - highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); -} - -// here in and out pointers (x and d arrays) should be different! 
we don't store -// their values inside -static INLINE void highbd_transpose8x16_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, - __m128i *d7) { - highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, - d5, d6, d7); - highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, - x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, - d4 + 1, d5 + 1, d6 + 1, d7 + 1); -} - -#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c deleted file mode 100644 index 584b5e7e3..000000000 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/blend.h" -#include "aom/aom_integer.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" - -static INLINE unsigned int masked_sad32xh_avx2( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int width, int height) { - int x, y; - __m256i res = _mm256_setzero_si256(); - const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m256i round_scale = - _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - for (y = 0; y < height; y++) { - for (x = 0; x < width; x += 32) { - const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); - const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); - const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); - const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); - const __m256i m_inv = _mm256_sub_epi8(mask_max, m); - - // Calculate 16 predicted pixels. - // Note that the maximum value of any entry of 'pred_l' or 'pred_r' - // is 64 * 255, so we have plenty of space to add rounding constants. 
- const __m256i data_l = _mm256_unpacklo_epi8(a, b); - const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); - __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); - pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); - - const __m256i data_r = _mm256_unpackhi_epi8(a, b); - const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); - __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); - pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); - - const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); - res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); - } - - src_ptr += src_stride; - a_ptr += a_stride; - b_ptr += b_stride; - m_ptr += m_stride; - } - // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. - res = _mm256_shuffle_epi32(res, 0xd8); - res = _mm256_permute4x64_epi64(res, 0xd8); - res = _mm256_hadd_epi32(res, res); - res = _mm256_hadd_epi32(res, res); - int32_t sad = _mm256_extract_epi32(res, 0); - return (sad + 31) >> 6; -} - -static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { - __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); - __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); - __m256i a = _mm256_castsi128_si256(a0); - return _mm256_inserti128_si256(a, a1, 1); -} - -static INLINE unsigned int masked_sad16xh_avx2( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { - int y; - __m256i res = _mm256_setzero_si256(); - const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m256i round_scale = - _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - for (y = 0; y < height; y += 2) { - const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); - const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); - const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); - const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr); - const __m256i m_inv = 
_mm256_sub_epi8(mask_max, m); - - // Calculate 16 predicted pixels. - // Note that the maximum value of any entry of 'pred_l' or 'pred_r' - // is 64 * 255, so we have plenty of space to add rounding constants. - const __m256i data_l = _mm256_unpacklo_epi8(a, b); - const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); - __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); - pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); - - const __m256i data_r = _mm256_unpackhi_epi8(a, b); - const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); - __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); - pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); - - const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); - res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); - - src_ptr += src_stride << 1; - a_ptr += a_stride << 1; - b_ptr += b_stride << 1; - m_ptr += m_stride << 1; - } - // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. - res = _mm256_shuffle_epi32(res, 0xd8); - res = _mm256_permute4x64_epi64(res, 0xd8); - res = _mm256_hadd_epi32(res, res); - res = _mm256_hadd_epi32(res, res); - int32_t sad = _mm256_extract_epi32(res, 0); - return (sad + 31) >> 6; -} - -static INLINE unsigned int aom_masked_sad_avx2( - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, - int invert_mask, int m, int n) { - unsigned int sad; - if (!invert_mask) { - switch (m) { - case 4: - sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, - second_pred, m, msk, msk_stride, n); - break; - case 8: - sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, - second_pred, m, msk, msk_stride, n); - break; - case 16: - sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, - m, msk, msk_stride, n); - break; - default: - sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, - m, msk, msk_stride, m, n); - break; - } - } else { - 
switch (m) { - case 4: - sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, - ref_stride, msk, msk_stride, n); - break; - case 8: - sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, - ref_stride, msk, msk_stride, n); - break; - case 16: - sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, - ref_stride, msk, msk_stride, n); - break; - default: - sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, - ref_stride, msk, msk_stride, m, n); - break; - } - } - return sad; -} - -#define MASKSADMXN_AVX2(m, n) \ - unsigned int aom_masked_sad##m##x##n##_avx2( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ - int invert_mask) { \ - return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ - msk, msk_stride, invert_mask, m, n); \ - } - -MASKSADMXN_AVX2(4, 4) -MASKSADMXN_AVX2(4, 8) -MASKSADMXN_AVX2(8, 4) -MASKSADMXN_AVX2(8, 8) -MASKSADMXN_AVX2(8, 16) -MASKSADMXN_AVX2(16, 8) -MASKSADMXN_AVX2(16, 16) -MASKSADMXN_AVX2(16, 32) -MASKSADMXN_AVX2(32, 16) -MASKSADMXN_AVX2(32, 32) -MASKSADMXN_AVX2(32, 64) -MASKSADMXN_AVX2(64, 32) -MASKSADMXN_AVX2(64, 64) -MASKSADMXN_AVX2(64, 128) -MASKSADMXN_AVX2(128, 64) -MASKSADMXN_AVX2(128, 128) -MASKSADMXN_AVX2(4, 16) -MASKSADMXN_AVX2(16, 4) -MASKSADMXN_AVX2(8, 32) -MASKSADMXN_AVX2(32, 8) -MASKSADMXN_AVX2(16, 64) -MASKSADMXN_AVX2(64, 16) - -static INLINE unsigned int highbd_masked_sad8xh_avx2( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); - int y; - __m256i res = _mm256_setzero_si256(); - const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m256i round_const = - 
_mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m256i one = _mm256_set1_epi16(1); - - for (y = 0; y < height; y += 2) { - const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); - const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); - const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); - // Zero-extend mask to 16 bits - const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(m_ptr)), - _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); - const __m256i m_inv = _mm256_sub_epi16(mask_max, m); - - const __m256i data_l = _mm256_unpacklo_epi16(a, b); - const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); - __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); - pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m256i data_r = _mm256_unpackhi_epi16(a, b); - const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); - __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); - pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), - AOM_BLEND_A64_ROUND_BITS); - - // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, - // so it is safe to do signed saturation here. - const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); - // There is no 16-bit SAD instruction, so we have to synthesize - // an 8-element SAD. We do this by storing 4 32-bit partial SADs, - // and accumulating them at the end - const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); - res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); - - src_ptr += src_stride << 1; - a_ptr += a_stride << 1; - b_ptr += b_stride << 1; - m_ptr += m_stride << 1; - } - // At this point, we have four 32-bit partial SADs stored in 'res'. 
- res = _mm256_hadd_epi32(res, res); - res = _mm256_hadd_epi32(res, res); - int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); - return (sad + 31) >> 6; -} - -static INLINE unsigned int highbd_masked_sad16xh_avx2( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int width, int height) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); - int x, y; - __m256i res = _mm256_setzero_si256(); - const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m256i round_const = - _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m256i one = _mm256_set1_epi16(1); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x += 16) { - const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); - const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); - const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); - // Zero-extend mask to 16 bits - const __m256i m = - _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); - const __m256i m_inv = _mm256_sub_epi16(mask_max, m); - - const __m256i data_l = _mm256_unpacklo_epi16(a, b); - const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); - __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); - pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m256i data_r = _mm256_unpackhi_epi16(a, b); - const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); - __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); - pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), - AOM_BLEND_A64_ROUND_BITS); - - // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, - // so it is safe to do signed saturation here. 
- const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); - // There is no 16-bit SAD instruction, so we have to synthesize - // an 8-element SAD. We do this by storing 4 32-bit partial SADs, - // and accumulating them at the end - const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); - res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); - } - - src_ptr += src_stride; - a_ptr += a_stride; - b_ptr += b_stride; - m_ptr += m_stride; - } - // At this point, we have four 32-bit partial SADs stored in 'res'. - res = _mm256_hadd_epi32(res, res); - res = _mm256_hadd_epi32(res, res); - int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); - return (sad + 31) >> 6; -} - -static INLINE unsigned int aom_highbd_masked_sad_avx2( - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, - int invert_mask, int m, int n) { - unsigned int sad; - if (!invert_mask) { - switch (m) { - case 4: - sad = - aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, - second_pred, m, msk, msk_stride, n); - break; - case 8: - sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, - second_pred, m, msk, msk_stride, n); - break; - default: - sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, - second_pred, m, msk, msk_stride, m, n); - break; - } - } else { - switch (m) { - case 4: - sad = - aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, - ref_stride, msk, msk_stride, n); - break; - case 8: - sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, - ref_stride, msk, msk_stride, n); - break; - default: - sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, - ref_stride, msk, msk_stride, m, n); - break; - } - } - return sad; -} - -#define HIGHBD_MASKSADMXN_AVX2(m, n) \ - unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int 
ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ - int msk_stride, int invert_mask) { \ - return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ - second_pred8, msk, msk_stride, \ - invert_mask, m, n); \ - } - -HIGHBD_MASKSADMXN_AVX2(4, 4); -HIGHBD_MASKSADMXN_AVX2(4, 8); -HIGHBD_MASKSADMXN_AVX2(8, 4); -HIGHBD_MASKSADMXN_AVX2(8, 8); -HIGHBD_MASKSADMXN_AVX2(8, 16); -HIGHBD_MASKSADMXN_AVX2(16, 8); -HIGHBD_MASKSADMXN_AVX2(16, 16); -HIGHBD_MASKSADMXN_AVX2(16, 32); -HIGHBD_MASKSADMXN_AVX2(32, 16); -HIGHBD_MASKSADMXN_AVX2(32, 32); -HIGHBD_MASKSADMXN_AVX2(32, 64); -HIGHBD_MASKSADMXN_AVX2(64, 32); -HIGHBD_MASKSADMXN_AVX2(64, 64); -HIGHBD_MASKSADMXN_AVX2(64, 128); -HIGHBD_MASKSADMXN_AVX2(128, 64); -HIGHBD_MASKSADMXN_AVX2(128, 128); -HIGHBD_MASKSADMXN_AVX2(4, 16); -HIGHBD_MASKSADMXN_AVX2(16, 4); -HIGHBD_MASKSADMXN_AVX2(8, 32); -HIGHBD_MASKSADMXN_AVX2(32, 8); -HIGHBD_MASKSADMXN_AVX2(16, 64); -HIGHBD_MASKSADMXN_AVX2(64, 16); diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c deleted file mode 100644 index 493f9bd8f..000000000 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdio.h> -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/blend.h" -#include "aom/aom_integer.h" -#include "aom_dsp/x86/synonyms.h" - -#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" - -// For width a multiple of 16 -static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, - int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int width, int height); - -#define MASKSADMXN_SSSE3(m, n) \ - unsigned int aom_masked_sad##m##x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ - int invert_mask) { \ - if (!invert_mask) \ - return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - m, msk, msk_stride, m, n); \ - else \ - return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \ - ref_stride, msk, msk_stride, m, n); \ - } - -#define MASKSAD8XN_SSSE3(n) \ - unsigned int aom_masked_sad8x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ - int invert_mask) { \ - if (!invert_mask) \ - return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 8, msk, msk_stride, n); \ - else \ - return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ - ref_stride, msk, msk_stride, n); \ - } - -#define MASKSAD4XN_SSSE3(n) \ - unsigned int aom_masked_sad4x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ - int invert_mask) { \ - if (!invert_mask) \ - return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 4, msk, msk_stride, n); \ - else \ - return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ - 
ref_stride, msk, msk_stride, n); \ - } - -MASKSADMXN_SSSE3(128, 128) -MASKSADMXN_SSSE3(128, 64) -MASKSADMXN_SSSE3(64, 128) -MASKSADMXN_SSSE3(64, 64) -MASKSADMXN_SSSE3(64, 32) -MASKSADMXN_SSSE3(32, 64) -MASKSADMXN_SSSE3(32, 32) -MASKSADMXN_SSSE3(32, 16) -MASKSADMXN_SSSE3(16, 32) -MASKSADMXN_SSSE3(16, 16) -MASKSADMXN_SSSE3(16, 8) -MASKSAD8XN_SSSE3(16) -MASKSAD8XN_SSSE3(8) -MASKSAD8XN_SSSE3(4) -MASKSAD4XN_SSSE3(8) -MASKSAD4XN_SSSE3(4) -MASKSAD4XN_SSSE3(16) -MASKSADMXN_SSSE3(16, 4) -MASKSAD8XN_SSSE3(32) -MASKSADMXN_SSSE3(32, 8) -MASKSADMXN_SSSE3(16, 64) -MASKSADMXN_SSSE3(64, 16) - -static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, - int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int width, int height) { - int x, y; - __m128i res = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x += 16) { - const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); - const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); - const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); - const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); - const __m128i m_inv = _mm_sub_epi8(mask_max, m); - - // Calculate 16 predicted pixels. - // Note that the maximum value of any entry of 'pred_l' or 'pred_r' - // is 64 * 255, so we have plenty of space to add rounding constants. 
- const __m128i data_l = _mm_unpacklo_epi8(a, b); - const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); - __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); - pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); - - const __m128i data_r = _mm_unpackhi_epi8(a, b); - const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); - __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); - pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); - - const __m128i pred = _mm_packus_epi16(pred_l, pred_r); - res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); - } - - src_ptr += src_stride; - a_ptr += a_stride; - b_ptr += b_stride; - m_ptr += m_stride; - } - // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. - int32_t sad = - _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); - return (sad + 31) >> 6; -} - -unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height) { - int y; - __m128i res = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - - for (y = 0; y < height; y += 2) { - const __m128i src = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)src_ptr), - _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); - const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); - const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); - const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); - const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); - const __m128i m = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), - _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); - const __m128i m_inv = _mm_sub_epi8(mask_max, m); - - const __m128i data_l = _mm_unpacklo_epi8(a0, b0); - const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); - __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); - pred_l = 
xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); - - const __m128i data_r = _mm_unpacklo_epi8(a1, b1); - const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); - __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); - pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); - - const __m128i pred = _mm_packus_epi16(pred_l, pred_r); - res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); - - src_ptr += src_stride * 2; - a_ptr += a_stride * 2; - b_ptr += b_stride * 2; - m_ptr += m_stride * 2; - } - int32_t sad = - _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); - return (sad + 31) >> 6; -} - -unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height) { - int y; - __m128i res = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - - for (y = 0; y < height; y += 2) { - // Load two rows at a time, this seems to be a bit faster - // than four rows at a time in this case. 
- const __m128i src = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(uint32_t *)src_ptr), - _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); - const __m128i a = - _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr), - _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride])); - const __m128i b = - _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), - _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); - const __m128i m = - _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), - _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); - const __m128i m_inv = _mm_sub_epi8(mask_max, m); - - const __m128i data = _mm_unpacklo_epi8(a, b); - const __m128i mask = _mm_unpacklo_epi8(m, m_inv); - __m128i pred_16bit = _mm_maddubs_epi16(data, mask); - pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); - - const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); - res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); - - src_ptr += src_stride * 2; - a_ptr += a_stride * 2; - b_ptr += b_stride * 2; - m_ptr += m_stride * 2; - } - // At this point, the SAD is stored in lane 0 of 'res' - int32_t sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; -} - -// For width a multiple of 8 -static INLINE unsigned int highbd_masked_sad_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int width, int height); - -#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ - unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ - int msk_stride, int invert_mask) { \ - if (!invert_mask) \ - return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \ - second_pred8, m, msk, msk_stride, m, n); \ - else \ - return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \ - ref_stride, msk, msk_stride, m, n); \ - } - -#define 
HIGHBD_MASKSAD4XN_SSSE3(n) \ - unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ - int msk_stride, int invert_mask) { \ - if (!invert_mask) \ - return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ - ref_stride, second_pred8, 4, msk, \ - msk_stride, n); \ - else \ - return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ - ref8, ref_stride, msk, msk_stride, \ - n); \ - } - -HIGHBD_MASKSADMXN_SSSE3(128, 128) -HIGHBD_MASKSADMXN_SSSE3(128, 64) -HIGHBD_MASKSADMXN_SSSE3(64, 128) -HIGHBD_MASKSADMXN_SSSE3(64, 64) -HIGHBD_MASKSADMXN_SSSE3(64, 32) -HIGHBD_MASKSADMXN_SSSE3(32, 64) -HIGHBD_MASKSADMXN_SSSE3(32, 32) -HIGHBD_MASKSADMXN_SSSE3(32, 16) -HIGHBD_MASKSADMXN_SSSE3(16, 32) -HIGHBD_MASKSADMXN_SSSE3(16, 16) -HIGHBD_MASKSADMXN_SSSE3(16, 8) -HIGHBD_MASKSADMXN_SSSE3(8, 16) -HIGHBD_MASKSADMXN_SSSE3(8, 8) -HIGHBD_MASKSADMXN_SSSE3(8, 4) -HIGHBD_MASKSAD4XN_SSSE3(8) -HIGHBD_MASKSAD4XN_SSSE3(4) -HIGHBD_MASKSAD4XN_SSSE3(16) -HIGHBD_MASKSADMXN_SSSE3(16, 4) -HIGHBD_MASKSADMXN_SSSE3(8, 32) -HIGHBD_MASKSADMXN_SSSE3(32, 8) -HIGHBD_MASKSADMXN_SSSE3(16, 64) -HIGHBD_MASKSADMXN_SSSE3(64, 16) - -static INLINE unsigned int highbd_masked_sad_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int width, int height) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); - int x, y; - __m128i res = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i round_const = - _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m128i one = _mm_set1_epi16(1); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x += 8) { - const __m128i src = _mm_loadu_si128((const 
__m128i *)&src_ptr[x]); - const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); - const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); - // Zero-extend mask to 16 bits - const __m128i m = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); - const __m128i m_inv = _mm_sub_epi16(mask_max, m); - - const __m128i data_l = _mm_unpacklo_epi16(a, b); - const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); - __m128i pred_l = _mm_madd_epi16(data_l, mask_l); - pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i data_r = _mm_unpackhi_epi16(a, b); - const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); - __m128i pred_r = _mm_madd_epi16(data_r, mask_r); - pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), - AOM_BLEND_A64_ROUND_BITS); - - // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, - // so it is safe to do signed saturation here. - const __m128i pred = _mm_packs_epi32(pred_l, pred_r); - // There is no 16-bit SAD instruction, so we have to synthesize - // an 8-element SAD. We do this by storing 4 32-bit partial SADs, - // and accumulating them at the end - const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); - res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); - } - - src_ptr += src_stride; - a_ptr += a_stride; - b_ptr += b_stride; - m_ptr += m_stride; - } - // At this point, we have four 32-bit partial SADs stored in 'res'. 
- res = _mm_hadd_epi32(res, res); - res = _mm_hadd_epi32(res, res); - int sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; -} - -unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, - const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); - int y; - __m128i res = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i round_const = - _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m128i one = _mm_set1_epi16(1); - - for (y = 0; y < height; y += 2) { - const __m128i src = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)src_ptr), - _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); - const __m128i a = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), - _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); - const __m128i b = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), - _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); - // Zero-extend mask to 16 bits - const __m128i m = _mm_unpacklo_epi8( - _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), - _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), - _mm_setzero_si128()); - const __m128i m_inv = _mm_sub_epi16(mask_max, m); - - const __m128i data_l = _mm_unpacklo_epi16(a, b); - const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); - __m128i pred_l = _mm_madd_epi16(data_l, mask_l); - pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i data_r = _mm_unpackhi_epi16(a, b); - const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); - __m128i pred_r = _mm_madd_epi16(data_r, mask_r); - pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i 
pred = _mm_packs_epi32(pred_l, pred_r); - const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); - res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); - - src_ptr += src_stride * 2; - a_ptr += a_stride * 2; - b_ptr += b_stride * 2; - m_ptr += m_stride * 2; - } - res = _mm_hadd_epi32(res, res); - res = _mm_hadd_epi32(res, res); - int sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; -} diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h deleted file mode 100644 index cffbd9672..000000000 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ -#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ - -unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height); - -unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height); - -unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, - const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m_ptr, int m_stride, - int height); - -#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c deleted file mode 100644 index d7dbefd7d..000000000 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ /dev/null @@ -1,1064 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <stdlib.h> -#include <string.h> -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/blend.h" -#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_ports/mem.h" - -// For width a multiple of 16 -static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, - int yoffset, uint8_t *dst, int w, int h); - -static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, - int yoffset, uint8_t *dst, int h); - -static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, - int yoffset, uint8_t *dst, int h); - -// For width a multiple of 16 -static void masked_variance(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int width, - int height, unsigned int *sse, int *sum_); - -static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, const uint8_t *b_ptr, - const uint8_t *m_ptr, int m_stride, int height, - unsigned int *sse, int *sum_); - -static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, const uint8_t *b_ptr, - const uint8_t *m_ptr, int m_stride, int height, - unsigned int *sse, int *sum_); - -#define MASK_SUBPIX_VAR_SSSE3(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - int sum; \ - uint8_t temp[(H + 1) * W]; \ - \ - bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ - \ - if (!invert_mask) \ - masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ - msk_stride, W, H, sse, &sum); \ - else \ 
- masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ - msk_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } - -#define MASK_SUBPIX_VAR8XH_SSSE3(H) \ - unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - int sum; \ - uint8_t temp[(H + 1) * 8]; \ - \ - bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \ - \ - if (!invert_mask) \ - masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ - H, sse, &sum); \ - else \ - masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ - H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \ - } - -#define MASK_SUBPIX_VAR4XH_SSSE3(H) \ - unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - int sum; \ - uint8_t temp[(H + 1) * 4]; \ - \ - bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ - \ - if (!invert_mask) \ - masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ - H, sse, &sum); \ - else \ - masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ - H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ - } - -MASK_SUBPIX_VAR_SSSE3(128, 128) -MASK_SUBPIX_VAR_SSSE3(128, 64) -MASK_SUBPIX_VAR_SSSE3(64, 128) -MASK_SUBPIX_VAR_SSSE3(64, 64) -MASK_SUBPIX_VAR_SSSE3(64, 32) -MASK_SUBPIX_VAR_SSSE3(32, 64) -MASK_SUBPIX_VAR_SSSE3(32, 32) -MASK_SUBPIX_VAR_SSSE3(32, 16) -MASK_SUBPIX_VAR_SSSE3(16, 32) -MASK_SUBPIX_VAR_SSSE3(16, 16) -MASK_SUBPIX_VAR_SSSE3(16, 8) -MASK_SUBPIX_VAR8XH_SSSE3(16) 
-MASK_SUBPIX_VAR8XH_SSSE3(8) -MASK_SUBPIX_VAR8XH_SSSE3(4) -MASK_SUBPIX_VAR4XH_SSSE3(8) -MASK_SUBPIX_VAR4XH_SSSE3(4) -MASK_SUBPIX_VAR4XH_SSSE3(16) -MASK_SUBPIX_VAR_SSSE3(16, 4) -MASK_SUBPIX_VAR8XH_SSSE3(32) -MASK_SUBPIX_VAR_SSSE3(32, 8) -MASK_SUBPIX_VAR_SSSE3(64, 16) -MASK_SUBPIX_VAR_SSSE3(16, 64) - -static INLINE __m128i filter_block(const __m128i a, const __m128i b, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi8(a, b); - v0 = _mm_maddubs_epi16(v0, filter); - v0 = xx_roundn_epu16(v0, FILTER_BITS); - - __m128i v1 = _mm_unpackhi_epi8(a, b); - v1 = _mm_maddubs_epi16(v1, filter); - v1 = xx_roundn_epu16(v1, FILTER_BITS); - - return _mm_packus_epi16(v0, v1); -} - -static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, - int yoffset, uint8_t *dst, int w, int h) { - int i, j; - // Horizontal filter - if (xoffset == 0) { - uint8_t *b = dst; - for (i = 0; i < h + 1; ++i) { - for (j = 0; j < w; j += 16) { - __m128i x = _mm_loadu_si128((__m128i *)&src[j]); - _mm_storeu_si128((__m128i *)&b[j], x); - } - src += src_stride; - b += w; - } - } else if (xoffset == 4) { - uint8_t *b = dst; - for (i = 0; i < h + 1; ++i) { - for (j = 0; j < w; j += 16) { - __m128i x = _mm_loadu_si128((__m128i *)&src[j]); - __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); - __m128i z = _mm_alignr_epi8(y, x, 1); - _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z)); - } - src += src_stride; - b += w; - } - } else { - uint8_t *b = dst; - const uint8_t *hfilter = bilinear_filters_2t[xoffset]; - const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); - for (i = 0; i < h + 1; ++i) { - for (j = 0; j < w; j += 16) { - const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); - const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); - const __m128i z = _mm_alignr_epi8(y, x, 1); - const __m128i res = filter_block(x, z, hfilter_vec); - _mm_storeu_si128((__m128i *)&b[j], res); - } - - src += src_stride; - b += w; - } - } - - // Vertical filter - if 
(yoffset == 0) { - // The data is already in 'dst', so no need to filter - } else if (yoffset == 4) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); - __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); - _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y)); - } - dst += w; - } - } else { - const uint8_t *vfilter = bilinear_filters_2t[yoffset]; - const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); - const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); - const __m128i res = filter_block(x, y, vfilter_vec); - _mm_storeu_si128((__m128i *)&dst[j], res); - } - - dst += w; - } - } -} - -static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0, - const __m128i a1, const __m128i b1, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi8(a0, b0); - v0 = _mm_maddubs_epi16(v0, filter); - v0 = xx_roundn_epu16(v0, FILTER_BITS); - - __m128i v1 = _mm_unpacklo_epi8(a1, b1); - v1 = _mm_maddubs_epi16(v1, filter); - v1 = xx_roundn_epu16(v1, FILTER_BITS); - - return _mm_packus_epi16(v0, v1); -} - -static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, - int yoffset, uint8_t *dst, int h) { - int i; - // Horizontal filter - if (xoffset == 0) { - uint8_t *b = dst; - for (i = 0; i < h + 1; ++i) { - __m128i x = _mm_loadl_epi64((__m128i *)src); - _mm_storel_epi64((__m128i *)b, x); - src += src_stride; - b += 8; - } - } else if (xoffset == 4) { - uint8_t *b = dst; - for (i = 0; i < h + 1; ++i) { - __m128i x = _mm_loadu_si128((__m128i *)src); - __m128i z = _mm_srli_si128(x, 1); - _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z)); - src += src_stride; - b += 8; - } - } else { - uint8_t *b = dst; - const uint8_t *hfilter = bilinear_filters_2t[xoffset]; - const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); - for 
(i = 0; i < h; i += 2) { - const __m128i x0 = _mm_loadu_si128((__m128i *)src); - const __m128i z0 = _mm_srli_si128(x0, 1); - const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); - const __m128i z1 = _mm_srli_si128(x1, 1); - const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec); - _mm_storeu_si128((__m128i *)b, res); - - src += src_stride * 2; - b += 16; - } - // Handle i = h separately - const __m128i x0 = _mm_loadu_si128((__m128i *)src); - const __m128i z0 = _mm_srli_si128(x0, 1); - - __m128i v0 = _mm_unpacklo_epi8(x0, z0); - v0 = _mm_maddubs_epi16(v0, hfilter_vec); - v0 = xx_roundn_epu16(v0, FILTER_BITS); - - _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0)); - } - - // Vertical filter - if (yoffset == 0) { - // The data is already in 'dst', so no need to filter - } else if (yoffset == 4) { - for (i = 0; i < h; ++i) { - __m128i x = _mm_loadl_epi64((__m128i *)dst); - __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); - _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y)); - dst += 8; - } - } else { - const uint8_t *vfilter = bilinear_filters_2t[yoffset]; - const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); - for (i = 0; i < h; i += 2) { - const __m128i x = _mm_loadl_epi64((__m128i *)dst); - const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); - const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); - const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec); - _mm_storeu_si128((__m128i *)dst, res); - - dst += 16; - } - } -} - -static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, - int yoffset, uint8_t *dst, int h) { - int i; - // Horizontal filter - if (xoffset == 0) { - uint8_t *b = dst; - for (i = 0; i < h + 1; ++i) { - __m128i x = xx_loadl_32((__m128i *)src); - xx_storel_32((__m128i *)b, x); - src += src_stride; - b += 4; - } - } else if (xoffset == 4) { - uint8_t *b = dst; - for (i = 0; i < h + 1; ++i) { - __m128i x = _mm_loadl_epi64((__m128i *)src); - __m128i z = 
_mm_srli_si128(x, 1); - xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z)); - src += src_stride; - b += 4; - } - } else { - uint8_t *b = dst; - const uint8_t *hfilter = bilinear_filters_2t[xoffset]; - const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); - for (i = 0; i < h; i += 4) { - const __m128i x0 = _mm_loadl_epi64((__m128i *)src); - const __m128i z0 = _mm_srli_si128(x0, 1); - const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]); - const __m128i z1 = _mm_srli_si128(x1, 1); - const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]); - const __m128i z2 = _mm_srli_si128(x2, 1); - const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]); - const __m128i z3 = _mm_srli_si128(x3, 1); - - const __m128i a0 = _mm_unpacklo_epi32(x0, x1); - const __m128i b0 = _mm_unpacklo_epi32(z0, z1); - const __m128i a1 = _mm_unpacklo_epi32(x2, x3); - const __m128i b1 = _mm_unpacklo_epi32(z2, z3); - const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec); - _mm_storeu_si128((__m128i *)b, res); - - src += src_stride * 4; - b += 16; - } - // Handle i = h separately - const __m128i x = _mm_loadl_epi64((__m128i *)src); - const __m128i z = _mm_srli_si128(x, 1); - - __m128i v0 = _mm_unpacklo_epi8(x, z); - v0 = _mm_maddubs_epi16(v0, hfilter_vec); - v0 = xx_roundn_epu16(v0, FILTER_BITS); - - xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0)); - } - - // Vertical filter - if (yoffset == 0) { - // The data is already in 'dst', so no need to filter - } else if (yoffset == 4) { - for (i = 0; i < h; ++i) { - __m128i x = xx_loadl_32((__m128i *)dst); - __m128i y = xx_loadl_32((__m128i *)&dst[4]); - xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y)); - dst += 4; - } - } else { - const uint8_t *vfilter = bilinear_filters_2t[yoffset]; - const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); - for (i = 0; i < h; i += 4) { - const __m128i a = xx_loadl_32((__m128i *)dst); - const __m128i b = xx_loadl_32((__m128i *)&dst[4]); 
- const __m128i c = xx_loadl_32((__m128i *)&dst[8]); - const __m128i d = xx_loadl_32((__m128i *)&dst[12]); - const __m128i e = xx_loadl_32((__m128i *)&dst[16]); - - const __m128i a0 = _mm_unpacklo_epi32(a, b); - const __m128i b0 = _mm_unpacklo_epi32(b, c); - const __m128i a1 = _mm_unpacklo_epi32(c, d); - const __m128i b1 = _mm_unpacklo_epi32(d, e); - const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec); - _mm_storeu_si128((__m128i *)dst, res); - - dst += 16; - } - } -} - -static INLINE void accumulate_block(const __m128i src, const __m128i a, - const __m128i b, const __m128i m, - __m128i *sum, __m128i *sum_sq) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i m_inv = _mm_sub_epi8(mask_max, m); - - // Calculate 16 predicted pixels. - // Note that the maximum value of any entry of 'pred_l' or 'pred_r' - // is 64 * 255, so we have plenty of space to add rounding constants. 
- const __m128i data_l = _mm_unpacklo_epi8(a, b); - const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); - __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); - pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); - - const __m128i data_r = _mm_unpackhi_epi8(a, b); - const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); - __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); - pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); - - const __m128i src_l = _mm_unpacklo_epi8(src, zero); - const __m128i src_r = _mm_unpackhi_epi8(src, zero); - const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); - const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); - - // Update partial sums and partial sums of squares - *sum = - _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one)); - *sum_sq = - _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l), - _mm_madd_epi16(diff_r, diff_r))); -} - -static void masked_variance(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int width, - int height, unsigned int *sse, int *sum_) { - int x, y; - __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x += 16) { - const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); - const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); - const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); - const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); - accumulate_block(src, a, b, m, &sum, &sum_sq); - } - - src_ptr += src_stride; - a_ptr += a_stride; - b_ptr += b_stride; - m_ptr += m_stride; - } - // Reduce down to a single sum and sum of squares - sum = _mm_hadd_epi32(sum, sum_sq); - sum = _mm_hadd_epi32(sum, sum); - *sum_ = _mm_cvtsi128_si32(sum); - *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); -} - -static void masked_variance8xh(const uint8_t *src_ptr, int 
src_stride, - const uint8_t *a_ptr, const uint8_t *b_ptr, - const uint8_t *m_ptr, int m_stride, int height, - unsigned int *sse, int *sum_) { - int y; - __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); - - for (y = 0; y < height; y += 2) { - __m128i src = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)src_ptr), - _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); - const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); - const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); - const __m128i m = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), - _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); - accumulate_block(src, a, b, m, &sum, &sum_sq); - - src_ptr += src_stride * 2; - a_ptr += 16; - b_ptr += 16; - m_ptr += m_stride * 2; - } - // Reduce down to a single sum and sum of squares - sum = _mm_hadd_epi32(sum, sum_sq); - sum = _mm_hadd_epi32(sum, sum); - *sum_ = _mm_cvtsi128_si32(sum); - *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); -} - -static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, - const uint8_t *a_ptr, const uint8_t *b_ptr, - const uint8_t *m_ptr, int m_stride, int height, - unsigned int *sse, int *sum_) { - int y; - __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); - - for (y = 0; y < height; y += 4) { - // Load four rows at a time - __m128i src = - _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride], - *(uint32_t *)&src_ptr[src_stride * 2], - *(uint32_t *)&src_ptr[src_stride * 3]); - const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); - const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); - const __m128i m = _mm_setr_epi32( - *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride], - *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]); - accumulate_block(src, a, b, m, &sum, &sum_sq); - - src_ptr += src_stride * 4; - a_ptr += 16; - b_ptr += 16; - m_ptr += m_stride * 4; - } - // Reduce down to a single sum and sum of 
squares - sum = _mm_hadd_epi32(sum, sum_sq); - sum = _mm_hadd_epi32(sum, sum); - *sum_ = _mm_cvtsi128_si32(sum); - *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); -} - -// For width a multiple of 8 -static void highbd_bilinear_filter(const uint16_t *src, int src_stride, - int xoffset, int yoffset, uint16_t *dst, - int w, int h); - -static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, - int xoffset, int yoffset, uint16_t *dst, - int h); - -// For width a multiple of 8 -static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, - const uint16_t *a_ptr, int a_stride, - const uint16_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int width, int height, uint64_t *sse, - int *sum_); - -static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, - const uint16_t *a_ptr, - const uint16_t *b_ptr, - const uint8_t *m_ptr, int m_stride, - int height, int *sse, int *sum_); - -#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \ - unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ - const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ - uint64_t sse64; \ - int sum; \ - uint16_t temp[(H + 1) * W]; \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ - \ - highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ - \ - if (!invert_mask) \ - highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ - msk_stride, W, H, &sse64, &sum); \ - else \ - highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ - msk_stride, W, H, &sse64, &sum); \ - *sse = (uint32_t)sse64; \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - unsigned int 
aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ - const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ - uint64_t sse64; \ - int sum; \ - int64_t var; \ - uint16_t temp[(H + 1) * W]; \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ - \ - highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ - \ - if (!invert_mask) \ - highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ - msk_stride, W, H, &sse64, &sum); \ - else \ - highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ - msk_stride, W, H, &sse64, &sum); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ - sum = ROUND_POWER_OF_TWO(sum, 2); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ - const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ - uint64_t sse64; \ - int sum; \ - int64_t var; \ - uint16_t temp[(H + 1) * W]; \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ - \ - highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ - \ - if (!invert_mask) \ - highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ - msk_stride, W, H, &sse64, &sum); \ - else \ - highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ - msk_stride, W, H, &sse64, &sum); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ - sum = ROUND_POWER_OF_TWO(sum, 4); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } - -#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ - unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ - const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ - int sse_; \ - int sum; \ - uint16_t temp[(H + 1) * 4]; \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ - \ - highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ - \ - if (!invert_mask) \ - highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ - msk_stride, H, &sse_, &sum); \ - else \ - highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ - msk_stride, H, &sse_, &sum); \ - *sse = (uint32_t)sse_; \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ - } \ - unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ - const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ - int sse_; \ - int sum; \ - int64_t var; \ - uint16_t temp[(H + 1) * 4]; \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ - \ - highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ - \ - if (!invert_mask) \ - highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ - msk_stride, H, &sse_, &sum); \ - else \ - highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ - msk_stride, H, &sse_, &sum); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ - sum = ROUND_POWER_OF_TWO(sum, 2); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / 
(4 * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ - const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ - int sse_; \ - int sum; \ - int64_t var; \ - uint16_t temp[(H + 1) * 4]; \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ - \ - highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ - \ - if (!invert_mask) \ - highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ - msk_stride, H, &sse_, &sum); \ - else \ - highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ - msk_stride, H, &sse_, &sum); \ - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ - sum = ROUND_POWER_OF_TWO(sum, 4); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } - -HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) -HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) -HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) -HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) - -static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi16(a, b); - v0 = _mm_madd_epi16(v0, filter); - v0 = xx_roundn_epu32(v0, FILTER_BITS); - - __m128i v1 = _mm_unpackhi_epi16(a, b); - v1 = _mm_madd_epi16(v1, filter); - v1 = xx_roundn_epu32(v1, FILTER_BITS); - - return _mm_packs_epi32(v0, v1); -} - -static void highbd_bilinear_filter(const uint16_t *src, int src_stride, - int xoffset, int yoffset, uint16_t *dst, - int w, int h) { - int i, j; - // Horizontal filter - if (xoffset == 0) { - uint16_t *b = dst; - for (i = 0; i < h + 1; ++i) { - for (j = 0; j < w; j += 8) { - __m128i x = _mm_loadu_si128((__m128i *)&src[j]); - _mm_storeu_si128((__m128i *)&b[j], x); - } - src += src_stride; - b += w; - } - } else if (xoffset == 4) { - uint16_t *b = dst; - for (i = 0; i < h + 1; ++i) { - for (j = 0; j < w; j += 8) { - __m128i x = _mm_loadu_si128((__m128i *)&src[j]); - __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); - __m128i z = _mm_alignr_epi8(y, x, 2); - _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z)); - } - src += src_stride; - b += w; - } - 
} else { - uint16_t *b = dst; - const uint8_t *hfilter = bilinear_filters_2t[xoffset]; - const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); - for (i = 0; i < h + 1; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); - const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); - const __m128i z = _mm_alignr_epi8(y, x, 2); - const __m128i res = highbd_filter_block(x, z, hfilter_vec); - _mm_storeu_si128((__m128i *)&b[j], res); - } - - src += src_stride; - b += w; - } - } - - // Vertical filter - if (yoffset == 0) { - // The data is already in 'dst', so no need to filter - } else if (yoffset == 4) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); - __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); - _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y)); - } - dst += w; - } - } else { - const uint8_t *vfilter = bilinear_filters_2t[yoffset]; - const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); - const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); - const __m128i res = highbd_filter_block(x, y, vfilter_vec); - _mm_storeu_si128((__m128i *)&dst[j], res); - } - - dst += w; - } - } -} - -static INLINE __m128i highbd_filter_block_2rows(const __m128i a0, - const __m128i b0, - const __m128i a1, - const __m128i b1, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi16(a0, b0); - v0 = _mm_madd_epi16(v0, filter); - v0 = xx_roundn_epu32(v0, FILTER_BITS); - - __m128i v1 = _mm_unpacklo_epi16(a1, b1); - v1 = _mm_madd_epi16(v1, filter); - v1 = xx_roundn_epu32(v1, FILTER_BITS); - - return _mm_packs_epi32(v0, v1); -} - -static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, - int xoffset, int yoffset, uint16_t *dst, - int h) { - int i; - // Horizontal filter - if (xoffset == 0) { 
- uint16_t *b = dst; - for (i = 0; i < h + 1; ++i) { - __m128i x = _mm_loadl_epi64((__m128i *)src); - _mm_storel_epi64((__m128i *)b, x); - src += src_stride; - b += 4; - } - } else if (xoffset == 4) { - uint16_t *b = dst; - for (i = 0; i < h + 1; ++i) { - __m128i x = _mm_loadu_si128((__m128i *)src); - __m128i z = _mm_srli_si128(x, 2); - _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z)); - src += src_stride; - b += 4; - } - } else { - uint16_t *b = dst; - const uint8_t *hfilter = bilinear_filters_2t[xoffset]; - const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); - for (i = 0; i < h; i += 2) { - const __m128i x0 = _mm_loadu_si128((__m128i *)src); - const __m128i z0 = _mm_srli_si128(x0, 2); - const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); - const __m128i z1 = _mm_srli_si128(x1, 2); - const __m128i res = - highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec); - _mm_storeu_si128((__m128i *)b, res); - - src += src_stride * 2; - b += 8; - } - // Process i = h separately - __m128i x = _mm_loadu_si128((__m128i *)src); - __m128i z = _mm_srli_si128(x, 2); - - __m128i v0 = _mm_unpacklo_epi16(x, z); - v0 = _mm_madd_epi16(v0, hfilter_vec); - v0 = xx_roundn_epu32(v0, FILTER_BITS); - - _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0)); - } - - // Vertical filter - if (yoffset == 0) { - // The data is already in 'dst', so no need to filter - } else if (yoffset == 4) { - for (i = 0; i < h; ++i) { - __m128i x = _mm_loadl_epi64((__m128i *)dst); - __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); - _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y)); - dst += 4; - } - } else { - const uint8_t *vfilter = bilinear_filters_2t[yoffset]; - const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); - for (i = 0; i < h; i += 2) { - const __m128i x = _mm_loadl_epi64((__m128i *)dst); - const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); - const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); - const __m128i res = 
highbd_filter_block_2rows(x, y, y, z, vfilter_vec); - _mm_storeu_si128((__m128i *)dst, res); - - dst += 8; - } - } -} - -static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, - const uint16_t *a_ptr, int a_stride, - const uint16_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, - int width, int height, uint64_t *sse, - int *sum_) { - int x, y; - // Note on bit widths: - // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26, - // so this can be kept as four 32-bit values. - // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38, - // so this must be stored as two 64-bit values. - __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i round_const = - _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m128i zero = _mm_setzero_si128(); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x += 8) { - const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); - const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); - const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); - const __m128i m = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero); - const __m128i m_inv = _mm_sub_epi16(mask_max, m); - - // Calculate 8 predicted pixels. 
- const __m128i data_l = _mm_unpacklo_epi16(a, b); - const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); - __m128i pred_l = _mm_madd_epi16(data_l, mask_l); - pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i data_r = _mm_unpackhi_epi16(a, b); - const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); - __m128i pred_r = _mm_madd_epi16(data_r, mask_r); - pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i src_l = _mm_unpacklo_epi16(src, zero); - const __m128i src_r = _mm_unpackhi_epi16(src, zero); - __m128i diff_l = _mm_sub_epi32(pred_l, src_l); - __m128i diff_r = _mm_sub_epi32(pred_r, src_r); - - // Update partial sums and partial sums of squares - sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); - // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit - // field, but the range of values is only [-(2^12 - 1), 2^12 - 1]. - // So we can re-pack into 16-bit fields and use _mm_madd_epi16 - // to calculate the squares and partially sum them. 
- const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); - const __m128i prod = _mm_madd_epi16(tmp, tmp); - // Then we want to sign-extend to 64 bits and accumulate - const __m128i sign = _mm_srai_epi32(prod, 31); - const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign); - const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign); - sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1)); - } - - src_ptr += src_stride; - a_ptr += a_stride; - b_ptr += b_stride; - m_ptr += m_stride; - } - // Reduce down to a single sum and sum of squares - sum = _mm_hadd_epi32(sum, zero); - sum = _mm_hadd_epi32(sum, zero); - *sum_ = _mm_cvtsi128_si32(sum); - sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8)); - _mm_storel_epi64((__m128i *)sse, sum_sq); -} - -static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, - const uint16_t *a_ptr, - const uint16_t *b_ptr, - const uint8_t *m_ptr, int m_stride, - int height, int *sse, int *sum_) { - int y; - // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions). - // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18 - // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30. - // So we can safely pack sum_sq into 32-bit fields, which is slightly more - // convenient. 
- __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); - const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i round_const = - _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m128i zero = _mm_setzero_si128(); - - for (y = 0; y < height; y += 2) { - __m128i src = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)src_ptr), - _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); - const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); - const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); - const __m128i m = _mm_unpacklo_epi8( - _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), - _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), - zero); - const __m128i m_inv = _mm_sub_epi16(mask_max, m); - - const __m128i data_l = _mm_unpacklo_epi16(a, b); - const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); - __m128i pred_l = _mm_madd_epi16(data_l, mask_l); - pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i data_r = _mm_unpackhi_epi16(a, b); - const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); - __m128i pred_r = _mm_madd_epi16(data_r, mask_r); - pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i src_l = _mm_unpacklo_epi16(src, zero); - const __m128i src_r = _mm_unpackhi_epi16(src, zero); - __m128i diff_l = _mm_sub_epi32(pred_l, src_l); - __m128i diff_r = _mm_sub_epi32(pred_r, src_r); - - // Update partial sums and partial sums of squares - sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); - const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); - const __m128i prod = _mm_madd_epi16(tmp, tmp); - sum_sq = _mm_add_epi32(sum_sq, prod); - - src_ptr += src_stride * 2; - a_ptr += 8; - b_ptr += 8; - m_ptr += m_stride * 2; - } - // Reduce down to a single sum and sum of squares - sum = _mm_hadd_epi32(sum, sum_sq); - sum = _mm_hadd_epi32(sum, zero); - *sum_ = 
_mm_cvtsi128_si32(sum); - *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); -} - -void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { - const uint8_t *src0 = invert_mask ? pred : ref; - const uint8_t *src1 = invert_mask ? ref : pred; - const int stride0 = invert_mask ? width : ref_stride; - const int stride1 = invert_mask ? ref_stride : width; - assert(height % 2 == 0); - int i = 0; - if (width == 8) { - comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, - mask, mask_stride); - } else if (width == 16) { - do { - comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); - comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1, - mask + mask_stride, comp_pred + width); - comp_pred += (width << 1); - src0 += (stride0 << 1); - src1 += (stride1 << 1); - mask += (mask_stride << 1); - i += 2; - } while (i < height); - } else { // width == 32 - assert(width == 32); - do { - comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); - comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16); - comp_pred += (width); - src0 += (stride0); - src1 += (stride1); - mask += (mask_stride); - i += 1; - } while (i < height); - } -} diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h deleted file mode 100644 index 4faa098ac..000000000 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ -#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ - -#include <stdlib.h> -#include <string.h> -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/blend.h" - -static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0, - const uint8_t *src1, - const uint8_t *mask, uint8_t *dst) { - const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i round_offset = - _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - - const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0)); - const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1)); - const __m128i aA = _mm_load_si128((const __m128i *)(mask)); - - const __m128i maA = _mm_sub_epi8(alpha_max, aA); - - const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1); - const __m128i aaAL = _mm_unpacklo_epi8(aA, maA); - const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1); - const __m128i aaAH = _mm_unpackhi_epi8(aA, maA); - - const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL); - const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH); - - const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset); - const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset); - _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH)); -} - -static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, - const uint8_t *src0, int stride0, - const uint8_t *src1, int stride1, - const uint8_t *mask, - int mask_stride) { - int i = 0; - const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const __m128i round_offset = - _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); - do { - // odd line A - const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0)); - const __m128i sA1 = 
_mm_loadl_epi64((const __m128i *)(src1)); - const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask)); - // even line B - const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0)); - const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1)); - const __m128i a = _mm_castps_si128(_mm_loadh_pi( - _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride))); - - const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1); - const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1); - - const __m128i ma = _mm_sub_epi8(alpha_max, a); - const __m128i aaA = _mm_unpacklo_epi8(a, ma); - const __m128i aaB = _mm_unpackhi_epi8(a, ma); - - const __m128i blendA = _mm_maddubs_epi16(ssA, aaA); - const __m128i blendB = _mm_maddubs_epi16(ssB, aaB); - const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset); - const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset); - const __m128i round = _mm_packus_epi16(roundA, roundB); - // comp_pred's stride == width == 8 - _mm_store_si128((__m128i *)(comp_pred), round); - comp_pred += (8 << 1); - src0 += (stride0 << 1); - src1 += (stride1 << 1); - mask += (mask_stride << 1); - i += 2; - } while (i < height); -} - -#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h deleted file mode 100644 index 6c821673e..000000000 --- a/third_party/aom/aom_dsp/x86/mem_sse2.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ -#define AOM_AOM_DSP_X86_MEM_SSE2_H_ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) { - return _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); -} - -static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, - const int byte_stride) { - return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride), - *(const int32_t *)((int8_t *)src + 1 * byte_stride), - *(const int32_t *)((int8_t *)src + 2 * byte_stride), - *(const int32_t *)((int8_t *)src + 3 * byte_stride)); -} - -static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, - const int byte_stride) { - __m128i dst; - dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); - dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); - return dst; -} - -#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h deleted file mode 100644 index 5181e444c..000000000 --- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ -#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ - -#include <smmintrin.h> - -#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" - -static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int h) { - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n)); - const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); - const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); - - const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. - const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); - const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * h); - - *sum = xx_hsum_epi32_si32(v_sum_d); - *sse = xx_hsum_epi32_si32(v_sse_d); -} - -#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h deleted file mode 100644 index 48486c6c4..000000000 --- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ -#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ - -#include <immintrin.h> - -#include "config/aom_config.h" - -static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { - v_d = _mm_hadd_epi32(v_d, v_d); - v_d = _mm_hadd_epi32(v_d, v_d); - return _mm_cvtsi128_si32(v_d); -} - -static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { - v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); -#if ARCH_X86_64 - return _mm_cvtsi128_si64(v_q); -#else - { - int64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_q); - return tmp; - } -#endif -} - -static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { - const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); - const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); - const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); - return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); -} - -// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) -static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); - const __m128i v_tmp_d = - _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - -#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c deleted file mode 100644 index 2aa2a0555..000000000 --- 
a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <immintrin.h> - -#include "config/aom_config.h" - -#include "aom_ports/mem.h" -#include "aom/aom_integer.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" -#include "aom_dsp/x86/synonyms.h" - -//////////////////////////////////////////////////////////////////////////////// -// 8 bit -//////////////////////////////////////////////////////////////////////////////// - -static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int height) { - int n = 0; - __m256i v_sad_d = _mm256_setzero_si256(); - const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); - - do { - const __m128i v_p_b_0 = xx_loadl_32(pre); - const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride); - const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1); - const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); - const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); - - const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); - - const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); - const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); - - // Rounded absolute difference - const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); - const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); - - v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); - - n += 8; - pre += pre_stride << 1; - } while (n < 8 * (height >> 1)); - - __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); - __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); - v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); - return xx_hsum_epi32_si32(v_sad_d_0); -} - -static INLINE unsigned int obmc_sad_w8n_avx2( - const uint8_t *pre, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, const int width, const int height) { - const int pre_step = pre_stride - width; - int n = 0; - __m256i v_sad_d = _mm256_setzero_si256(); - const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); - assert(width >= 8); - assert(IS_POWER_OF_TWO(width)); - - do { - const __m128i v_p0_b = xx_loadl_64(pre + n); - const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); - const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); - - const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); - - const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); - const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); - - // Rounded absolute difference - const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); - const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); - - v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); - - n += 8; - - if ((n & (width - 1)) == 0) pre += pre_step; - } while (n < width * height); - - __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); - __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); - v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); - return xx_hsum_epi32_si32(v_sad_d_0); -} - -#define OBMCSADWXH(w, h) \ - unsigned int aom_obmc_sad##w##x##h##_avx2( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *msk) { \ - if (w == 4) { \ - return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ - } else { \ - return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ - } \ - } - -OBMCSADWXH(128, 128) -OBMCSADWXH(128, 64) -OBMCSADWXH(64, 128) -OBMCSADWXH(64, 64) -OBMCSADWXH(64, 32) -OBMCSADWXH(32, 64) -OBMCSADWXH(32, 32) -OBMCSADWXH(32, 16) -OBMCSADWXH(16, 32) -OBMCSADWXH(16, 16) -OBMCSADWXH(16, 8) -OBMCSADWXH(8, 16) -OBMCSADWXH(8, 8) -OBMCSADWXH(8, 4) -OBMCSADWXH(4, 8) -OBMCSADWXH(4, 4) -OBMCSADWXH(4, 16) -OBMCSADWXH(16, 4) -OBMCSADWXH(8, 32) -OBMCSADWXH(32, 8) -OBMCSADWXH(16, 64) -OBMCSADWXH(64, 16) - -//////////////////////////////////////////////////////////////////////////////// -// High bit-depth -//////////////////////////////////////////////////////////////////////////////// - -static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int height) { - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - int n = 0; - __m256i v_sad_d = _mm256_setzero_si256(); - const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); - do { - const 
__m128i v_p_w_0 = xx_loadl_64(pre); - const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); - const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); - const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); - const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); - - const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. - const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); - - const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); - const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); - - // Rounded absolute difference - - const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); - const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); - - v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); - - n += 8; - - pre += pre_stride << 1; - } while (n < 8 * (height >> 1)); - - __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); - __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); - v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); - return xx_hsum_epi32_si32(v_sad_d_0); -} - -static INLINE unsigned int hbd_obmc_sad_w8n_avx2( - const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, const int width, const int height) { - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - const int pre_step = pre_stride - width; - int n = 0; - __m256i v_sad_d = _mm256_setzero_si256(); - const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); - - assert(width >= 8); - assert(IS_POWER_OF_TWO(width)); - - do { - const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); - const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); - const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); - - const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); - - // Values in both pre and mask fit in 15 bits, 
and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. - const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); - - const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); - const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); - - // Rounded absolute difference - const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); - const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); - - v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); - - n += 8; - - if (n % width == 0) pre += pre_step; - } while (n < width * height); - - __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); - __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); - v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); - return xx_hsum_epi32_si32(v_sad_d_0); -} - -#define HBD_OBMCSADWXH(w, h) \ - unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask) { \ - if (w == 4) { \ - return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \ - } else { \ - return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \ - } \ - } - -HBD_OBMCSADWXH(128, 128) -HBD_OBMCSADWXH(128, 64) -HBD_OBMCSADWXH(64, 128) -HBD_OBMCSADWXH(64, 64) -HBD_OBMCSADWXH(64, 32) -HBD_OBMCSADWXH(32, 64) -HBD_OBMCSADWXH(32, 32) -HBD_OBMCSADWXH(32, 16) -HBD_OBMCSADWXH(16, 32) -HBD_OBMCSADWXH(16, 16) -HBD_OBMCSADWXH(16, 8) -HBD_OBMCSADWXH(8, 16) -HBD_OBMCSADWXH(8, 8) -HBD_OBMCSADWXH(8, 4) -HBD_OBMCSADWXH(4, 8) -HBD_OBMCSADWXH(4, 4) -HBD_OBMCSADWXH(4, 16) -HBD_OBMCSADWXH(16, 4) -HBD_OBMCSADWXH(8, 32) -HBD_OBMCSADWXH(32, 8) -HBD_OBMCSADWXH(16, 64) -HBD_OBMCSADWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c deleted file mode 100644 index 0338a8c77..000000000 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) 2016, Alliance 
for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <immintrin.h> - -#include "config/aom_config.h" - -#include "aom_ports/mem.h" -#include "aom/aom_integer.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" -#include "aom_dsp/x86/synonyms.h" - -//////////////////////////////////////////////////////////////////////////////// -// 8 bit -//////////////////////////////////////////////////////////////////////////////// - -static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int height) { - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sad_d = _mm_setzero_si128(); - - do { - const __m128i v_p_b = xx_loadl_32(pre + n); - const __m128i v_m_d = xx_load_128(mask + n); - const __m128i v_w_d = xx_load_128(wsrc + n); - - const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); - - // Rounded absolute difference - const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); - - v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * height); - - return xx_hsum_epi32_si32(v_sad_d); -} - -static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( - const uint8_t *pre, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, const int width, const int height) { - const int pre_step = pre_stride - width; - int n = 0; - __m128i v_sad_d = _mm_setzero_si128(); - - assert(width >= 8); - assert(IS_POWER_OF_TWO(width)); - - do { - const __m128i v_p1_b = xx_loadl_32(pre + n + 4); - const __m128i v_m1_d = xx_load_128(mask + n + 4); - const __m128i v_w1_d = xx_load_128(wsrc + n + 4); - const __m128i v_p0_b = xx_loadl_32(pre + n); - const __m128i v_m0_d = xx_load_128(mask + n); - const __m128i v_w0_d = xx_load_128(wsrc + n); - - const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); - const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); - const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); - - const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); - const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); - const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); - const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); - - // Rounded absolute difference - const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); - const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); - - v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); - v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); - - n += 8; - - if (n % width == 0) pre += pre_step; - } while (n < width * height); - - return xx_hsum_epi32_si32(v_sad_d); -} - -#define OBMCSADWXH(w, h) \ - unsigned int aom_obmc_sad##w##x##h##_sse4_1( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *msk) { \ - if (w == 4) { \ - return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ - } else { \ - return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ - } \ - } - -OBMCSADWXH(128, 128) -OBMCSADWXH(128, 64) -OBMCSADWXH(64, 128) -OBMCSADWXH(64, 64) -OBMCSADWXH(64, 32) -OBMCSADWXH(32, 64) -OBMCSADWXH(32, 32) -OBMCSADWXH(32, 16) -OBMCSADWXH(16, 32) -OBMCSADWXH(16, 16) -OBMCSADWXH(16, 8) -OBMCSADWXH(8, 16) -OBMCSADWXH(8, 8) -OBMCSADWXH(8, 4) -OBMCSADWXH(4, 8) -OBMCSADWXH(4, 4) -OBMCSADWXH(4, 16) -OBMCSADWXH(16, 4) -OBMCSADWXH(8, 32) -OBMCSADWXH(32, 8) -OBMCSADWXH(16, 64) -OBMCSADWXH(64, 16) - -//////////////////////////////////////////////////////////////////////////////// -// High bit-depth -//////////////////////////////////////////////////////////////////////////////// - -static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int height) { - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sad_d = _mm_setzero_si128(); - - do { - 
const __m128i v_p_w = xx_loadl_64(pre + n); - const __m128i v_m_d = xx_load_128(mask + n); - const __m128i v_w_d = xx_load_128(wsrc + n); - - const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. - const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); - - // Rounded absolute difference - const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); - - v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * height); - - return xx_hsum_epi32_si32(v_sad_d); -} - -static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( - const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, const int width, const int height) { - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - const int pre_step = pre_stride - width; - int n = 0; - __m128i v_sad_d = _mm_setzero_si128(); - - assert(width >= 8); - assert(IS_POWER_OF_TWO(width)); - - do { - const __m128i v_p1_w = xx_loadl_64(pre + n + 4); - const __m128i v_m1_d = xx_load_128(mask + n + 4); - const __m128i v_w1_d = xx_load_128(wsrc + n + 4); - const __m128i v_p0_w = xx_loadl_64(pre + n); - const __m128i v_m0_d = xx_load_128(mask + n); - const __m128i v_w0_d = xx_load_128(wsrc + n); - - const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); - const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); - const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); - - const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); - const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); - const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); - const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); - - // Rounded absolute difference - const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); - const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); - - v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); - v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); - - n += 8; - - if (n % width == 0) pre += pre_step; - } while (n < width * height); - - return xx_hsum_epi32_si32(v_sad_d); -} - -#define HBD_OBMCSADWXH(w, h) \ - unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask) { \ - if (w == 4) { \ - return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ - } else { \ - return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ - } \ - } - -HBD_OBMCSADWXH(128, 128) -HBD_OBMCSADWXH(128, 64) -HBD_OBMCSADWXH(64, 128) -HBD_OBMCSADWXH(64, 64) -HBD_OBMCSADWXH(64, 32) -HBD_OBMCSADWXH(32, 64) -HBD_OBMCSADWXH(32, 32) -HBD_OBMCSADWXH(32, 16) -HBD_OBMCSADWXH(16, 32) -HBD_OBMCSADWXH(16, 16) -HBD_OBMCSADWXH(16, 8) -HBD_OBMCSADWXH(8, 16) -HBD_OBMCSADWXH(8, 8) -HBD_OBMCSADWXH(8, 4) -HBD_OBMCSADWXH(4, 8) -HBD_OBMCSADWXH(4, 4) -HBD_OBMCSADWXH(4, 16) -HBD_OBMCSADWXH(16, 4) -HBD_OBMCSADWXH(8, 32) -HBD_OBMCSADWXH(32, 8) -HBD_OBMCSADWXH(16, 64) -HBD_OBMCSADWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c deleted file mode 100644 index bfec0e8a8..000000000 --- a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <immintrin.h> - -#include "config/aom_config.h" - -#include "aom_ports/mem.h" -#include "aom/aom_integer.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/obmc_intrinsic_sse4.h" - -//////////////////////////////////////////////////////////////////////////////// -// 8 bit -//////////////////////////////////////////////////////////////////////////////// - -static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int w, const int h) { - int n = 0, width, height = h; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); - __m128i v_d; - const uint8_t *pre_temp; - assert(w >= 8); - assert(IS_POWER_OF_TWO(w)); - assert(IS_POWER_OF_TWO(h)); - do { - width = w; - pre_temp = pre; - do { - const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp); - const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n)); - const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); - const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); - const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); - - const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); - const __m256i v_tmp_d = - _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); - const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); - const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); - const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); - - const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); - const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - pre_temp += 8; - n += 8; - width -= 8; - } while (width > 0); - pre += pre_stride; - height -= 1; - } while (height > 0); - v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); - v_d = _mm_hadd_epi32(v_d, v_d); - *sum = _mm_cvtsi128_si32(v_d); - *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); -} - -static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int w, const int h) { - int n = 0, width, height = h; - __m256i v_d; - __m128i res0; - const uint8_t *pre_temp; - const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); - __m256i v_sum_d = _mm256_setzero_si256(); - __m256i v_sse_d = _mm256_setzero_si256(); - - assert(w >= 16); - assert(IS_POWER_OF_TWO(w)); - assert(IS_POWER_OF_TWO(h)); - do { - width = w; - pre_temp = pre; - do { - const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); - const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); - const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); - const __m256i v_m1_d = - _mm256_loadu_si256((__m256i const *)(mask + n + 8)); - const __m256i v_w1_d = - _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); - - const 
__m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); - const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); - - const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); - const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); - - const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); - const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); - - const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); - const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); - - const __m256i v_tmp0_d = - _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); - const __m256i v_tmp1_d = - _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); - - const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); - const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); - - const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); - const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); - const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); - - v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); - v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); - - pre_temp += 16; - n += 16; - width -= 16; - } while (width > 0); - pre += pre_stride; - height -= 1; - } while (height > 0); - - v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); - v_d = _mm256_hadd_epi32(v_d, v_d); - res0 = _mm256_castsi256_si128(v_d); - res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); - *sum = _mm_cvtsi128_si32(res0); - *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); -} - -#define OBMCVARWXH(W, H) \ - unsigned int aom_obmc_variance##W##x##H##_avx2( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - if (W == 4) { \ - obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ - } else if (W == 8) { \ - obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ - } else { \ - obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, 
&sum, W, H); \ - } \ - \ - return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ - } - -OBMCVARWXH(128, 128) -OBMCVARWXH(128, 64) -OBMCVARWXH(64, 128) -OBMCVARWXH(64, 64) -OBMCVARWXH(64, 32) -OBMCVARWXH(32, 64) -OBMCVARWXH(32, 32) -OBMCVARWXH(32, 16) -OBMCVARWXH(16, 32) -OBMCVARWXH(16, 16) -OBMCVARWXH(16, 8) -OBMCVARWXH(8, 16) -OBMCVARWXH(8, 8) -OBMCVARWXH(8, 4) -OBMCVARWXH(4, 8) -OBMCVARWXH(4, 4) -OBMCVARWXH(4, 16) -OBMCVARWXH(16, 4) -OBMCVARWXH(8, 32) -OBMCVARWXH(32, 8) -OBMCVARWXH(16, 64) -OBMCVARWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c deleted file mode 100644 index 72eda0e57..000000000 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <immintrin.h> - -#include "config/aom_config.h" - -#include "aom_ports/mem.h" -#include "aom/aom_integer.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/obmc_intrinsic_sse4.h" -#include "aom_dsp/x86/synonyms.h" - -//////////////////////////////////////////////////////////////////////////////// -// 8 bit -//////////////////////////////////////////////////////////////////////////////// - -void aom_var_filter_block2d_bil_first_pass_ssse3( - const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -void aom_var_filter_block2d_bil_second_pass_ssse3( - const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter); - -static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int w, const int h) { - const int pre_step = pre_stride - w; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(w >= 8); - assert(IS_POWER_OF_TWO(w)); - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p1_b = xx_loadl_32(pre + n + 4); - const __m128i v_m1_d = xx_load_128(mask + n + 4); - const __m128i v_w1_d = xx_load_128(wsrc + n + 4); - const __m128i v_p0_b = xx_loadl_32(pre + n); - const __m128i v_m0_d = xx_load_128(mask + n); - const __m128i v_w0_d = xx_load_128(wsrc + n); - - const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); - const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); - const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); - - const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); - const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); - - const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); - const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); - const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); - const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 8; - - if (n % w == 0) pre += pre_step; - } while (n < w * h); - - *sum = xx_hsum_epi32_si32(v_sum_d); - *sse = xx_hsum_epi32_si32(v_sse_d); -} - -#define OBMCVARWXH(W, H) \ - unsigned int aom_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - if (W == 4) { \ - obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ - } else { \ - obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ - } \ - return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ - } - -OBMCVARWXH(128, 128) -OBMCVARWXH(128, 64) -OBMCVARWXH(64, 128) -OBMCVARWXH(64, 64) -OBMCVARWXH(64, 32) -OBMCVARWXH(32, 64) -OBMCVARWXH(32, 32) -OBMCVARWXH(32, 16) -OBMCVARWXH(16, 32) -OBMCVARWXH(16, 16) -OBMCVARWXH(16, 8) -OBMCVARWXH(8, 16) -OBMCVARWXH(8, 8) -OBMCVARWXH(8, 4) -OBMCVARWXH(4, 8) -OBMCVARWXH(4, 4) -OBMCVARWXH(4, 16) -OBMCVARWXH(16, 4) -OBMCVARWXH(8, 32) -OBMCVARWXH(32, 8) -OBMCVARWXH(16, 64) -OBMCVARWXH(64, 16) - -#include "config/aom_dsp_rtcd.h" - -#define OBMC_SUBPIX_VAR(W, H) \ - uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * 
W]; \ - \ - aom_var_filter_block2d_bil_first_pass_ssse3( \ - pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_ssse3( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ - } - -OBMC_SUBPIX_VAR(128, 128) -OBMC_SUBPIX_VAR(128, 64) -OBMC_SUBPIX_VAR(64, 128) -OBMC_SUBPIX_VAR(64, 64) -OBMC_SUBPIX_VAR(64, 32) -OBMC_SUBPIX_VAR(32, 64) -OBMC_SUBPIX_VAR(32, 32) -OBMC_SUBPIX_VAR(32, 16) -OBMC_SUBPIX_VAR(16, 32) -OBMC_SUBPIX_VAR(16, 16) -OBMC_SUBPIX_VAR(16, 8) -OBMC_SUBPIX_VAR(8, 16) -OBMC_SUBPIX_VAR(8, 8) -OBMC_SUBPIX_VAR(8, 4) -OBMC_SUBPIX_VAR(4, 8) -OBMC_SUBPIX_VAR(4, 4) -OBMC_SUBPIX_VAR(4, 16) -OBMC_SUBPIX_VAR(16, 4) -OBMC_SUBPIX_VAR(8, 32) -OBMC_SUBPIX_VAR(32, 8) -OBMC_SUBPIX_VAR(16, 64) -OBMC_SUBPIX_VAR(64, 16) - -//////////////////////////////////////////////////////////////////////////////// -// High bit-depth -//////////////////////////////////////////////////////////////////////////////// - -static INLINE void hbd_obmc_variance_w4( - const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p_w = xx_loadl_64(pre + n); - const __m128i v_m_d = xx_load_128(mask + n); - const __m128i v_w_d = xx_load_128(wsrc + n); - - const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); - const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * h); - - *sum = xx_hsum_epi32_si32(v_sum_d); - *sse = xx_hsum_epi32_si32(v_sse_d); -} - -static INLINE void hbd_obmc_variance_w8n( - const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, - const int h) { - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - const int pre_step = pre_stride - w; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(w >= 8); - assert(IS_POWER_OF_TWO(w)); - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p1_w = xx_loadl_64(pre + n + 4); - const __m128i v_m1_d = xx_load_128(mask + n + 4); - const __m128i v_w1_d = xx_load_128(wsrc + n + 4); - const __m128i v_p0_w = xx_loadl_64(pre + n); - const __m128i v_m0_d = xx_load_128(mask + n); - const __m128i v_w0_d = xx_load_128(wsrc + n); - - const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); - const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. 
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); - const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); - - const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); - const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); - - const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); - const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); - const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); - const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 8; - - if (n % w == 0) pre += pre_step; - } while (n < w * h); - - *sum += xx_hsum_epi32_si64(v_sum_d); - *sse += xx_hsum_epi32_si64(v_sse_d); -} - -static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64 = 0; - uint64_t sse64 = 0; - if (w == 4) { - hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); - } else { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); - } - *sum = (int)sum64; - *sse = (unsigned int)sse64; -} - -static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64 = 0; - uint64_t sse64 = 0; - if (w == 4) { - hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); - } else if (w < 128 || h < 128) { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); - } else { - assert(w == 128 && h == 128); - - do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, - 64); - pre8 += 64 * pre_stride; - wsrc += 64 * w; - mask += 64 * w; - h -= 64; - } while (h > 0); - } - *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 
4); -} - -static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64 = 0; - uint64_t sse64 = 0; - int max_pel_allowed_per_ovf = 512; - if (w == 4) { - hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); - } else if (w * h <= max_pel_allowed_per_ovf) { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); - } else { - int h_per_ovf = max_pel_allowed_per_ovf / w; - - assert(max_pel_allowed_per_ovf % w == 0); - do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, - h_per_ovf); - pre8 += h_per_ovf * pre_stride; - wsrc += h_per_ovf * w; - mask += h_per_ovf * w; - h -= h_per_ovf; - } while (h > 0); - } - *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); -} - -#define HBD_OBMCVARWXH(W, H) \ - unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - \ - unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ - const int32_t *mask, unsigned int *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } - -HBD_OBMCVARWXH(128, 128) -HBD_OBMCVARWXH(128, 64) -HBD_OBMCVARWXH(64, 128) -HBD_OBMCVARWXH(64, 64) -HBD_OBMCVARWXH(64, 32) -HBD_OBMCVARWXH(32, 64) -HBD_OBMCVARWXH(32, 32) -HBD_OBMCVARWXH(32, 16) -HBD_OBMCVARWXH(16, 32) -HBD_OBMCVARWXH(16, 16) -HBD_OBMCVARWXH(16, 8) -HBD_OBMCVARWXH(8, 16) -HBD_OBMCVARWXH(8, 8) -HBD_OBMCVARWXH(8, 4) -HBD_OBMCVARWXH(4, 8) -HBD_OBMCVARWXH(4, 4) -HBD_OBMCVARWXH(4, 16) -HBD_OBMCVARWXH(16, 4) -HBD_OBMCVARWXH(8, 32) -HBD_OBMCVARWXH(32, 8) -HBD_OBMCVARWXH(16, 64) -HBD_OBMCVARWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm deleted file mode 100644 index 216a0bd8f..000000000 --- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm +++ /dev/null @@ -1,435 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - vzeroupper - -%ifnidn %1, b_32x32 - - ; Special case for ncoeff == 16, as it is frequent and we can save on - ; not setting up a loop. - cmp ncoeffmp, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Special case of ncoeff == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.single: - - movifnidn coeffq, coeffmp - movifnidn zbinq, zbinmp - mova m0, [zbinq] ; m0 = zbin - - ; Get DC and first 15 AC coeffs - in this special case, that is all. - ; coeff stored as 32bit numbers but we process them as 16 bit numbers - mova m9, [coeffq] - packssdw m9, [coeffq+16] ; m9 = c[i] - mova m10, [coeffq+32] - packssdw m10, [coeffq+48] ; m10 = c[i] - - mov r0, eobmp ; Output pointer - mov r1, qcoeffmp ; Output pointer - mov r2, dqcoeffmp ; Output pointer - - pxor m5, m5 ; m5 = dedicated zero - - pcmpeqw m4, m4 ; All word lanes -1 - paddw m0, m4 ; m0 = zbin - 1 - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, we just write zeros - ; to the outputs and we are done. - por m14, m7, m12 - ptest m14, m14 - jnz .single_nonzero - - mova [r1 ], ymm5 - mova [r1+32], ymm5 - mova [r2 ], ymm5 - mova [r2+32], ymm5 - mov [r0], word 0 - - vzeroupper - RET - -.single_nonzero: - - ; Actual quantization of size 16 block - setup pointers, rounders, etc. 
- movifnidn r3, roundmp - movifnidn r4, quantmp - mov r6, dequantmp - mov r5, shiftmp - mova m1, [r3] ; m1 = round - mova m2, [r4] ; m2 = quant - mova m3, [r6] ; m3 = dequant - mova m4, [r5] ; m4 = shift - - mov r3, iscanmp - - DEFINE_ARGS eob, qcoeff, dqcoeff, iscan - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq ], m11 - mova [qcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+32], m11 - mova [qcoeffq+48], m6 - - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q - - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq ], m11 - mova [dqcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+32], m11 - mova [dqcoeffq+48], m6 - - mova m6, [iscanq] ; m6 = scan[i] - mova m11, [iscanq+16] ; m11 = scan[i] - - pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 - psubw m6, m6, m7 ; m6 = scan[i] + 1 - psubw m11, m11, m12 ; m11 = scan[i] + 1 - pandn m8, m8, m6 ; m8 = max(eob) - pandn m13, m13, m11 ; m13 = max(eob) - pmaxsw m8, m8, m13 - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, 
m7 - movq rax, m8 - mov [eobq], ax - - vzeroupper - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of ncoeff != 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.generic: - -%endif ; %ifnidn %1, b_32x32 - -DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - ; Actual quantization loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - movifnidn dequantq, dequantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mova m3, [dequantq] ; m3 = dequant - pcmpeqw m4, m4 ; All lanes -1 -%ifidn %1, b_32x32 - psubw m0, m4 - psubw m1, m4 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - paddw m0, m4 ; m0 = m0 + 1 - - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob - - - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip forward quickly. 
- por m14, m7, m12 - ptest m14, m14 - jnz .first_nonzero - - mova [qcoeffq+ncoeffq*4 ], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4 ], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 - add ncoeffq, mmsize - - punpckhqdq m1, m1 - punpckhqdq m2, m2 - punpckhqdq m3, m3 - punpckhqdq m4, m4 - pxor m8, m8 - - jmp .ac_only_loop - -.first_nonzero: - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - 
pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - -.ac_only_loop: - - ; pack coeff from 32bit to 16bit array - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip this itertion. - ; And just write zeros as the result would be. - por m14, m7, m12 - ptest m14, m14 - jnz .rest_nonzero - - mova [qcoeffq+ncoeffq*4+ 0], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4+ 0], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 - - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - -.rest_nonzero: - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 - - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 
- psignw m13, m10 -%endif - - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET -%endmacro - -INIT_XMM avx -QUANTIZE_FN b, 9 -QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c deleted file mode 100644 index d3de6e24d..000000000 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <assert.h> -#include <emmintrin.h> -#include <xmmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/x86/quantize_x86.h" - -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -} - -void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - const __m128i zero = _mm_setzero_si128(); - int index = 16; - - __m128i zbin, round, quant, dequant, shift; - __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i cmp_mask0, cmp_mask1; - __m128i eob, eob0; - - (void)scan_ptr; - - // Setup global values. - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); - - // Do DC and first 15 AC. - coeff0 = load_coefficients(coeff_ptr); - coeff1 = load_coefficients(coeff_ptr + 8); - - // Poor man's abs(). 
- coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - - calculate_qcoeff(&qcoeff0, round, quant, shift); - - round = _mm_unpackhi_epi64(round, round); - quant = _mm_unpackhi_epi64(quant, quant); - shift = _mm_unpackhi_epi64(shift, shift); - - calculate_qcoeff(&qcoeff1, round, quant, shift); - - // Reinsert signs - qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr); - store_coefficients(qcoeff1, qcoeff_ptr + 8); - - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr); - store_coefficients(coeff1, dqcoeff_ptr + 8); - - eob = - scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); - - // AC only loop. 
- while (index < n_coeffs) { - coeff0 = load_coefficients(coeff_ptr + index); - coeff1 = load_coefficients(coeff_ptr + index + 8); - - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - - calculate_qcoeff(&qcoeff0, round, quant, shift); - calculate_qcoeff(&qcoeff1, round, quant, shift); - - qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); - qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr + index); - store_coefficients(qcoeff1, qcoeff_ptr + index + 8); - - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr + index); - store_coefficients(coeff1, dqcoeff_ptr + index + 8); - - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); - eob = _mm_max_epi16(eob, eob0); - - index += 16; - } - - *eob_ptr = accumulate_eob(eob); -} diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm deleted file mode 100644 index 39d4ca674..000000000 --- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,272 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - ; actual quantize loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - movifnidn dequantq, dequantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m0, m5 - paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [dequantq] ; m3 = dequant - mov r2, shiftmp - psubw m0, [GLOBAL(pw_1)] - mova m4, [r2] ; m4 = shift - mov r3, qcoeffmp - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += 
m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register - -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: - ; pack coeff from 32bit to 16bit array - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - 
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin -%ifidn %1, b_32x32 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - or r6, r2 - jz .skip_iter -%endif - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pxor m11, m11 - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register - -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 - - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - 
pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - -%ifidn %1, b_32x32 - jmp .accumulate_eob -.skip_iter: - mova [qcoeffq+ncoeffq*4+ 0], m5 - mova [qcoeffq+ncoeffq*4+16], m5 - mova [qcoeffq+ncoeffq*4+32], m5 - mova [qcoeffq+ncoeffq*4+48], m5 - mova [dqcoeffq+ncoeffq*4+ 0], m5 - mova [dqcoeffq+ncoeffq*4+16], m5 - mova [dqcoeffq+ncoeffq*4+32], m5 - mova [dqcoeffq+ncoeffq*4+48], m5 - add ncoeffq, mmsize - jl .ac_only_loop -%endif - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FN b, 9 -QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h deleted file mode 100644 index 4eed7dd29..000000000 --- a/third_party/aom/aom_dsp/x86/quantize_x86.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <emmintrin.h> - -#include "aom/aom_integer.h" - -static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, - const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, - const int16_t *dequant_ptr, __m128i *dequant, - const int16_t *shift_ptr, __m128i *shift) { - *zbin = _mm_load_si128((const __m128i *)zbin_ptr); - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); - *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); - *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)shift_ptr); -} - -// With ssse3 and later abs() and sign() are preferred. -static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { - a = _mm_xor_si128(a, sign); - return _mm_sub_epi16(a, sign); -} - -static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, - const __m128i quant, const __m128i shift) { - __m128i tmp, qcoeff; - qcoeff = _mm_adds_epi16(*coeff, round); - tmp = _mm_mulhi_epi16(qcoeff, quant); - qcoeff = _mm_add_epi16(tmp, qcoeff); - *coeff = _mm_mulhi_epi16(qcoeff, shift); -} - -static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { - return _mm_mullo_epi16(qcoeff, dequant); -} - -// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing -// to zbin to add 1 to the index in 'scan'. 
-static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, - const __m128i zbin_mask0, - const __m128i zbin_mask1, - const int16_t *scan_ptr, const int index, - const __m128i zero) { - const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); - const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); - __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); - __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); - __m128i eob0, eob1; - // Add one to convert from indices to counts - scan0 = _mm_sub_epi16(scan0, zbin_mask0); - scan1 = _mm_sub_epi16(scan1, zbin_mask1); - eob0 = _mm_andnot_si128(zero_coeff0, scan0); - eob1 = _mm_andnot_si128(zero_coeff1, scan1); - return _mm_max_epi16(eob0, eob1); -} - -static INLINE int16_t accumulate_eob(__m128i eob) { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - return _mm_extract_epi16(eob, 1); -} diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c deleted file mode 100644 index f662b62b1..000000000 --- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ -#include <immintrin.h> // AVX2 - -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32; i++) { - // load src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - { - __m128i sum; - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } - _mm256_zeroupper(); -} - -void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; - __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; - __m256i ref3_reg, ref3next_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64; i++) { - // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); - ref3_reg = 
_mm256_loadu_si256((const __m256i *)ref3); - ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); - ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); - ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); - ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); - - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - { - __m128i sum; - - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } - _mm256_zeroupper(); -} - -void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += src_stride << 5; - rf[0] += ref_stride << 5; - rf[1] += ref_stride << 5; - rf[2] += ref_stride << 5; - rf[3] += ref_stride << 5; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} - -void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; - unsigned int half_width = 32; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += half_width; - rf[0] += half_width; - rf[1] += half_width; - rf[2] += half_width; - rf[3] += half_width; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = 
sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm deleted file mode 100644 index 55a856985..000000000 --- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm +++ /dev/null @@ -1,257 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_4x2x4 5-6 0 - movd m0, [srcq +%2] -%if %1 == 1 - movd m6, [ref1q+%3] - movd m4, [ref2q+%3] - movd m7, [ref3q+%3] - movd m5, [ref4q+%3] - movd m1, [srcq +%4] - movd m2, [ref1q+%5] - punpckldq m0, m1 - punpckldq m6, m2 - movd m1, [ref2q+%5] - movd m2, [ref3q+%5] - movd m3, [ref4q+%5] - punpckldq m4, m1 - punpckldq m7, m2 - punpckldq m5, m3 - movlhps m0, m0 - movlhps m6, m4 - movlhps m7, m5 - psadbw m6, m0 - psadbw m7, m0 -%else - movd m1, [ref1q+%3] - movd m5, [ref1q+%5] - movd m2, [ref2q+%3] - movd m4, [ref2q+%5] - punpckldq m1, m5 - punpckldq m2, m4 - movd m3, [ref3q+%3] - movd m5, [ref3q+%5] - punpckldq m3, m5 - movd m4, [ref4q+%3] - movd m5, [ref4q+%5] - punpckldq m4, m5 - movd m5, [srcq +%4] - punpckldq m0, m5 - movlhps m0, m0 - movlhps m1, m2 - movlhps m3, m4 - psadbw m1, m0 - psadbw m3, m0 - paddd m6, m1 - paddd m7, m3 -%endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea 
ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] -%endif -%endmacro - -; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_8x2x4 5-6 0 - movh m0, [srcq +%2] -%if %1 == 1 - movh m4, [ref1q+%3] - movh m5, [ref2q+%3] - movh m6, [ref3q+%3] - movh m7, [ref4q+%3] - movhps m0, [srcq +%4] - movhps m4, [ref1q+%5] - movhps m5, [ref2q+%5] - movhps m6, [ref3q+%5] - movhps m7, [ref4q+%5] - psadbw m4, m0 - psadbw m5, m0 - psadbw m6, m0 - psadbw m7, m0 -%else - movh m1, [ref1q+%3] - movh m2, [ref2q+%3] - movh m3, [ref3q+%3] - movhps m0, [srcq +%4] - movhps m1, [ref1q+%5] - movhps m2, [ref2q+%5] - movhps m3, [ref3q+%5] - psadbw m1, m0 - psadbw m2, m0 - psadbw m3, m0 - paddd m4, m1 - movh m1, [ref4q+%3] - movhps m1, [ref4q+%5] - paddd m5, m2 - paddd m6, m3 - psadbw m1, m0 - paddd m7, m1 -%endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] -%endif -%endmacro - -; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_16x2x4 5-6 0 - ; 1st 16 px - mova m0, [srcq +%2] -%if %1 == 1 - movu m4, [ref1q+%3] - movu m5, [ref2q+%3] - movu m6, [ref3q+%3] - movu m7, [ref4q+%3] - psadbw m4, m0 - psadbw m5, m0 - psadbw m6, m0 - psadbw m7, m0 -%else - movu m1, [ref1q+%3] - movu m2, [ref2q+%3] - movu m3, [ref3q+%3] - psadbw m1, m0 - psadbw m2, m0 - psadbw m3, m0 - paddd m4, m1 - movu m1, [ref4q+%3] - paddd m5, m2 - paddd m6, m3 - psadbw m1, m0 - paddd m7, m1 -%endif - - ; 2nd 16 px - mova m0, [srcq +%4] - movu m1, [ref1q+%5] - movu m2, [ref2q+%5] - movu m3, [ref3q+%5] - psadbw m1, m0 - psadbw m2, m0 - psadbw m3, m0 - paddd m4, m1 - movu m1, [ref4q+%5] - paddd m5, m2 - paddd m6, m3 -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] -%endif - psadbw m1, 
m0 - paddd m7, m1 -%endmacro - -; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_32x2x4 5-6 0 - PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 - PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 -%endmacro - -; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_64x2x4 5-6 0 - PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 - PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 -%endmacro - -; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_128x2x4 5-6 0 - PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64 - PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6 -%endmacro - -; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, -; uint8_t *ref[4], int ref_stride, -; uint32_t res[4]); -; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 -%if UNIX64 -cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ - res, ref2, ref3, ref4 -%else -cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ - ref2, ref3, ref4 -%endif - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - mov ref2q, [ref1q+gprsize*1] - mov ref3q, [ref1q+gprsize*2] - mov ref4q, [ref1q+gprsize*3] - mov ref1q, [ref1q+gprsize*0] - - PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 -%endrep - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 - -%if %1 > 4 - pslldq m5, 4 - pslldq m7, 4 - por m4, m5 - por m6, m7 - mova m5, m4 - mova m7, m6 - punpcklqdq m4, m6 - punpckhqdq m5, m7 - movifnidn r4, r4mp - paddd m4, m5 - movu [r4], m4 - RET -%else - movifnidn r4, r4mp - pshufd m6, m6, 0x08 - pshufd m7, m7, 0x08 - movq [r4+0], m6 - movq [r4+8], m7 - RET -%endif -%endmacro - -INIT_XMM sse2 -SADNXN4D 128, 128 -SADNXN4D 128, 64 -SADNXN4D 64, 128 -SADNXN4D 64, 64 -SADNXN4D 64, 32 -SADNXN4D 32, 64 -SADNXN4D 32, 32 -SADNXN4D 32, 16 -SADNXN4D 16, 32 -SADNXN4D 16, 16 -SADNXN4D 16, 8 
-SADNXN4D 8, 16 -SADNXN4D 8, 8 -SADNXN4D 8, 4 -SADNXN4D 4, 8 -SADNXN4D 4, 4 -SADNXN4D 4, 16 -SADNXN4D 16, 4 -SADNXN4D 8, 32 -SADNXN4D 32, 8 -SADNXN4D 16, 64 -SADNXN4D 64, 16 diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c deleted file mode 100644 index a50dba64a..000000000 --- a/third_party/aom/aom_dsp/x86/sad_avx2.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" - -#define FSAD64_H(h) \ - unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = 
_mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - _mm256_zeroupper(); \ - return res; \ - } - -#define FSAD32_H(h) \ - unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - _mm256_zeroupper(); \ - return res; \ - } - -#define FSAD64 \ - FSAD64_H(64); \ - FSAD64_H(32); - -#define FSAD32 \ - FSAD32_H(64); \ - FSAD32_H(32); \ - FSAD32_H(16); - -/* clang-format off */ -FSAD64 -FSAD32 -/* clang-format on */ - -#undef FSAD64 -#undef FSAD32 -#undef FSAD64_H -#undef FSAD32_H - -#define FSADAVG64_H(h) \ - unsigned int aom_sad64x##h##_avg_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ 
- for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - ref1_reg = _mm256_avg_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - second_pred += 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - _mm256_zeroupper(); \ - return res; \ - } - -#define FSADAVG32_H(h) \ - unsigned int aom_sad32x##h##_avg_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - ref1_reg = _mm256_avg_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - 
_mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - second_pred += 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - _mm256_zeroupper(); \ - return res; \ - } - -#define FSADAVG64 \ - FSADAVG64_H(64); \ - FSADAVG64_H(32); - -#define FSADAVG32 \ - FSADAVG32_H(64); \ - FSADAVG32_H(32); \ - FSADAVG32_H(16); - -/* clang-format off */ -FSADAVG64 -FSADAVG32 -/* clang-format on */ - -#undef FSADAVG64 -#undef FSADAVG32 -#undef FSADAVG64_H -#undef FSADAVG32_H diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c deleted file mode 100644 index b506d4663..000000000 --- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c +++ /dev/null @@ -1,1038 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <immintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/x86/synonyms_avx2.h" -#include "aom_ports/mem.h" - -// SAD -static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { - // input 8 32-bit summation - __m128i lo128, hi128; - __m256i u = _mm256_srli_si256(*v, 8); - u = _mm256_add_epi32(u, *v); - - // 4 32-bit summation - hi128 = _mm256_extracti128_si256(u, 1); - lo128 = _mm256_castsi256_si128(u); - lo128 = _mm_add_epi32(hi128, lo128); - - // 2 32-bit summation - hi128 = _mm_srli_si128(lo128, 4); - lo128 = _mm_add_epi32(lo128, hi128); - - return (unsigned int)_mm_cvtsi128_si32(lo128); -} - -unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); - const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); - - // first 4 rows - __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - __m256i u0 = _mm256_sub_epi16(s0, r0); - __m256i u1 = _mm256_sub_epi16(s1, r1); - __m256i u2 = _mm256_sub_epi16(s2, r2); - __m256i u3 = _mm256_sub_epi16(s3, r3); - __m256i zero = _mm256_setzero_si256(); - __m256i sum0, sum1; - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum0 = _mm256_add_epi16(u0, u1); - sum0 = _mm256_add_epi16(sum0, u2); - sum0 = _mm256_add_epi16(sum0, 
u3); - - // second 4 rows - src_ptr += src_stride << 2; - ref_ptr += ref_stride << 2; - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - u0 = _mm256_sub_epi16(s0, r0); - u1 = _mm256_sub_epi16(s1, r1); - u2 = _mm256_sub_epi16(s2, r2); - u3 = _mm256_sub_epi16(s3, r3); - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum1 = _mm256_add_epi16(u0, u1); - sum1 = _mm256_add_epi16(sum1, u2); - sum1 = _mm256_add_epi16(sum1, u3); - - // find out the SAD - s0 = _mm256_unpacklo_epi16(sum0, zero); - s1 = _mm256_unpackhi_epi16(sum0, zero); - r0 = _mm256_unpacklo_epi16(sum1, zero); - r1 = _mm256_unpackhi_epi16(sum1, zero); - s0 = _mm256_add_epi32(s0, s1); - r0 = _mm256_add_epi32(r0, r1); - sum0 = _mm256_add_epi32(s0, r0); - // 8 32-bit summation - - return (unsigned int)get_sad_from_mm256_epi32(&sum0); -} - -unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); - const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); - __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3; - __m256i sum0; - __m256i sum = _mm256_setzero_si256(); - const __m256i zero = _mm256_setzero_si256(); - int row = 0; - - // Loop for every 4 rows - while (row < 16) { - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * 
src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - u0 = _mm256_sub_epi16(s0, r0); - u1 = _mm256_sub_epi16(s1, r1); - u2 = _mm256_sub_epi16(s2, r2); - u3 = _mm256_sub_epi16(s3, r3); - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum0 = _mm256_add_epi16(u0, u1); - sum0 = _mm256_add_epi16(sum0, u2); - sum0 = _mm256_add_epi16(sum0, u3); - - s0 = _mm256_unpacklo_epi16(sum0, zero); - s1 = _mm256_unpackhi_epi16(sum0, zero); - sum = _mm256_add_epi32(sum, s0); - sum = _mm256_add_epi32(sum, s1); - // 8 32-bit summation - - row += 4; - src_ptr += src_stride << 2; - ref_ptr += ref_stride << 2; - } - return get_sad_from_mm256_epi32(&sum); -} - -static void sad32x4(const uint16_t *src_ptr, int src_stride, - const uint16_t *ref_ptr, int ref_stride, - const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s0, s1, s2, s3, r0, r1, r2, r3; - const __m256i zero = _mm256_setzero_si256(); - int row_sections = 0; - - while (row_sections < 2) { - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); - - if (sec_ptr) { - r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); - r1 = _mm256_avg_epu16( - r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 
16))); - r2 = _mm256_avg_epu16( - r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r3 = _mm256_avg_epu16( - r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); - } - s0 = _mm256_sub_epi16(s0, r0); - s1 = _mm256_sub_epi16(s1, r1); - s2 = _mm256_sub_epi16(s2, r2); - s3 = _mm256_sub_epi16(s3, r3); - - s0 = _mm256_abs_epi16(s0); - s1 = _mm256_abs_epi16(s1); - s2 = _mm256_abs_epi16(s2); - s3 = _mm256_abs_epi16(s3); - - s0 = _mm256_add_epi16(s0, s1); - s0 = _mm256_add_epi16(s0, s2); - s0 = _mm256_add_epi16(s0, s3); - - r0 = _mm256_unpacklo_epi16(s0, zero); - r1 = _mm256_unpackhi_epi16(s0, zero); - - r0 = _mm256_add_epi32(r0, r1); - *sad_acc = _mm256_add_epi32(*sad_acc, r0); - - row_sections += 1; - src_ptr += src_stride << 1; - ref_ptr += ref_stride << 1; - if (sec_ptr) sec_ptr += 32 << 1; - } -} - -unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - __m256i sad = _mm256_setzero_si256(); - uint16_t *srcp = CONVERT_TO_SHORTPTR(src); - uint16_t *refp = CONVERT_TO_SHORTPTR(ref); - const int left_shift = 2; - int row_section = 0; - - while (row_section < 4) { - sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); - srcp += src_stride << left_shift; - refp += ref_stride << left_shift; - row_section += 1; - } - return get_sad_from_mm256_epi32(&sad); -} - -unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 4; - ref += ref_stride << 4; - sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 4; - ref += ref_stride << 4; - sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); - 
return sum; -} - -unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 5; - ref += ref_stride << 5; - sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -static void sad64x2(const uint16_t *src_ptr, int src_stride, - const uint16_t *ref_ptr, int ref_stride, - const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s[8], r[8]; - const __m256i zero = _mm256_setzero_si256(); - - s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); - s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); - s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); - s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); - s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32)); - s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48)); - - r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); - r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); - r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); - r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); - r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32)); - r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48)); - - if (sec_ptr) { - r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); - r[1] = _mm256_avg_epu16( - r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r[2] = _mm256_avg_epu16( - r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r[3] = _mm256_avg_epu16( - r[3], _mm256_loadu_si256((const __m256i 
*)(sec_ptr + 48))); - r[4] = _mm256_avg_epu16( - r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); - r[5] = _mm256_avg_epu16( - r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); - r[6] = _mm256_avg_epu16( - r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); - r[7] = _mm256_avg_epu16( - r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); - } - - s[0] = _mm256_sub_epi16(s[0], r[0]); - s[1] = _mm256_sub_epi16(s[1], r[1]); - s[2] = _mm256_sub_epi16(s[2], r[2]); - s[3] = _mm256_sub_epi16(s[3], r[3]); - s[4] = _mm256_sub_epi16(s[4], r[4]); - s[5] = _mm256_sub_epi16(s[5], r[5]); - s[6] = _mm256_sub_epi16(s[6], r[6]); - s[7] = _mm256_sub_epi16(s[7], r[7]); - - s[0] = _mm256_abs_epi16(s[0]); - s[1] = _mm256_abs_epi16(s[1]); - s[2] = _mm256_abs_epi16(s[2]); - s[3] = _mm256_abs_epi16(s[3]); - s[4] = _mm256_abs_epi16(s[4]); - s[5] = _mm256_abs_epi16(s[5]); - s[6] = _mm256_abs_epi16(s[6]); - s[7] = _mm256_abs_epi16(s[7]); - - s[0] = _mm256_add_epi16(s[0], s[1]); - s[0] = _mm256_add_epi16(s[0], s[2]); - s[0] = _mm256_add_epi16(s[0], s[3]); - - s[4] = _mm256_add_epi16(s[4], s[5]); - s[4] = _mm256_add_epi16(s[4], s[6]); - s[4] = _mm256_add_epi16(s[4], s[7]); - - r[0] = _mm256_unpacklo_epi16(s[0], zero); - r[1] = _mm256_unpackhi_epi16(s[0], zero); - r[2] = _mm256_unpacklo_epi16(s[4], zero); - r[3] = _mm256_unpackhi_epi16(s[4], zero); - - r[0] = _mm256_add_epi32(r[0], r[1]); - r[0] = _mm256_add_epi32(r[0], r[2]); - r[0] = _mm256_add_epi32(r[0], r[3]); - *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); -} - -unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - __m256i sad = _mm256_setzero_si256(); - uint16_t *srcp = CONVERT_TO_SHORTPTR(src); - uint16_t *refp = CONVERT_TO_SHORTPTR(ref); - const int left_shift = 1; - int row_section = 0; - - while (row_section < 16) { - sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); - srcp += src_stride << left_shift; - refp += ref_stride << 
left_shift; - row_section += 1; - } - return get_sad_from_mm256_epi32(&sad); -} - -unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 5; - ref += ref_stride << 5; - sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, - const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s[8], r[8]; - const __m256i zero = _mm256_setzero_si256(); - - s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); - s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); - s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); - s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64)); - s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80)); - s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96)); - s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112)); - - r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); - r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); - r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); - r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64)); - r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80)); - r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96)); - r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112)); - - if (sec_ptr) { - r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); - r[1] = _mm256_avg_epu16( - r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r[2] = _mm256_avg_epu16( - r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r[3] = _mm256_avg_epu16( - r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); - r[4] = _mm256_avg_epu16( - r[4], 
_mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); - r[5] = _mm256_avg_epu16( - r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); - r[6] = _mm256_avg_epu16( - r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); - r[7] = _mm256_avg_epu16( - r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); - } - - s[0] = _mm256_sub_epi16(s[0], r[0]); - s[1] = _mm256_sub_epi16(s[1], r[1]); - s[2] = _mm256_sub_epi16(s[2], r[2]); - s[3] = _mm256_sub_epi16(s[3], r[3]); - s[4] = _mm256_sub_epi16(s[4], r[4]); - s[5] = _mm256_sub_epi16(s[5], r[5]); - s[6] = _mm256_sub_epi16(s[6], r[6]); - s[7] = _mm256_sub_epi16(s[7], r[7]); - - s[0] = _mm256_abs_epi16(s[0]); - s[1] = _mm256_abs_epi16(s[1]); - s[2] = _mm256_abs_epi16(s[2]); - s[3] = _mm256_abs_epi16(s[3]); - s[4] = _mm256_abs_epi16(s[4]); - s[5] = _mm256_abs_epi16(s[5]); - s[6] = _mm256_abs_epi16(s[6]); - s[7] = _mm256_abs_epi16(s[7]); - - s[0] = _mm256_add_epi16(s[0], s[1]); - s[0] = _mm256_add_epi16(s[0], s[2]); - s[0] = _mm256_add_epi16(s[0], s[3]); - - s[4] = _mm256_add_epi16(s[4], s[5]); - s[4] = _mm256_add_epi16(s[4], s[6]); - s[4] = _mm256_add_epi16(s[4], s[7]); - - r[0] = _mm256_unpacklo_epi16(s[0], zero); - r[1] = _mm256_unpackhi_epi16(s[0], zero); - r[2] = _mm256_unpacklo_epi16(s[4], zero); - r[3] = _mm256_unpackhi_epi16(s[4], zero); - - r[0] = _mm256_add_epi32(r[0], r[1]); - r[0] = _mm256_add_epi32(r[0], r[2]); - r[0] = _mm256_add_epi32(r[0], r[3]); - *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); -} - -unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - __m256i sad = _mm256_setzero_si256(); - uint16_t *srcp = CONVERT_TO_SHORTPTR(src); - uint16_t *refp = CONVERT_TO_SHORTPTR(ref); - int row = 0; - while (row < 64) { - sad128x1(srcp, refp, NULL, &sad); - srcp += src_stride; - refp += ref_stride; - row += 1; - } - return get_sad_from_mm256_epi32(&sad); -} - -unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride, - 
const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 6; - ref += ref_stride << 6; - sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 6; - ref += ref_stride << 6; - sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. -static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, - const uint16_t *ref_ptr, int ref_stride, - const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s0, s1, s2, s3, r0, r1, r2, r3; - const __m256i zero = _mm256_setzero_si256(); - - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - if (sec_ptr) { - r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); - r1 = _mm256_avg_epu16(r1, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r2 = _mm256_avg_epu16(r2, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r3 = _mm256_avg_epu16(r3, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); - } - - s0 = _mm256_sub_epi16(s0, r0); - s1 = _mm256_sub_epi16(s1, r1); - s2 = _mm256_sub_epi16(s2, r2); - s3 = _mm256_sub_epi16(s3, r3); - - s0 = _mm256_abs_epi16(s0); - s1 = _mm256_abs_epi16(s1); - s2 
= _mm256_abs_epi16(s2); - s3 = _mm256_abs_epi16(s3); - - s0 = _mm256_add_epi16(s0, s1); - s0 = _mm256_add_epi16(s0, s2); - s0 = _mm256_add_epi16(s0, s3); - - r0 = _mm256_unpacklo_epi16(s0, zero); - r1 = _mm256_unpackhi_epi16(s0, zero); - - r0 = _mm256_add_epi32(r0, r1); - *sad_acc = _mm256_add_epi32(*sad_acc, r0); -} - -unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - __m256i sad = _mm256_setzero_si256(); - uint16_t *srcp = CONVERT_TO_SHORTPTR(src); - uint16_t *refp = CONVERT_TO_SHORTPTR(ref); - uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); - - sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); - - // Next 4 rows - srcp += src_stride << 2; - refp += ref_stride << 2; - secp += 64; - sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); - return get_sad_from_mm256_epi32(&sad); -} - -unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - const int left_shift = 3; - uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - src += src_stride << left_shift; - ref += ref_stride << left_shift; - second_pred += 16 << left_shift; - sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - return sum; -} - -unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - const int left_shift = 4; - uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - src += src_stride << left_shift; - ref += ref_stride << left_shift; - second_pred += 16 << left_shift; - sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - return sum; -} - -unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const 
uint8_t *second_pred) { - __m256i sad = _mm256_setzero_si256(); - uint16_t *srcp = CONVERT_TO_SHORTPTR(src); - uint16_t *refp = CONVERT_TO_SHORTPTR(ref); - uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); - const int left_shift = 2; - int row_section = 0; - - while (row_section < 4) { - sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); - srcp += src_stride << left_shift; - refp += ref_stride << left_shift; - secp += 32 << left_shift; - row_section += 1; - } - return get_sad_from_mm256_epi32(&sad); -} - -unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - const int left_shift = 4; - uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - src += src_stride << left_shift; - ref += ref_stride << left_shift; - second_pred += 32 << left_shift; - sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - return sum; -} - -unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - const int left_shift = 5; - uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - src += src_stride << left_shift; - ref += ref_stride << left_shift; - second_pred += 32 << left_shift; - sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - return sum; -} - -unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - __m256i sad = _mm256_setzero_si256(); - uint16_t *srcp = CONVERT_TO_SHORTPTR(src); - uint16_t *refp = CONVERT_TO_SHORTPTR(ref); - uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); - const int left_shift = 1; - int row_section = 0; - - while (row_section < 16) { - sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); - srcp += src_stride << left_shift; - 
refp += ref_stride << left_shift; - secp += 64 << left_shift; - row_section += 1; - } - return get_sad_from_mm256_epi32(&sad); -} - -unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - const int left_shift = 5; - uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - src += src_stride << left_shift; - ref += ref_stride << left_shift; - second_pred += 64 << left_shift; - sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - return sum; -} - -unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - const int left_shift = 6; - uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - src += src_stride << left_shift; - ref += ref_stride << left_shift; - second_pred += 64 << left_shift; - sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - return sum; -} - -unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - __m256i sad = _mm256_setzero_si256(); - uint16_t *srcp = CONVERT_TO_SHORTPTR(src); - uint16_t *refp = CONVERT_TO_SHORTPTR(ref); - uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); - int row = 0; - while (row < 64) { - sad128x1(srcp, refp, secp, &sad); - srcp += src_stride; - refp += ref_stride; - secp += 16 << 3; - row += 1; - } - return get_sad_from_mm256_epi32(&sad); -} - -unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - const uint8_t *second_pred) { - unsigned int sum; - const int left_shift = 6; - - sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - src += src_stride << left_shift; - ref += ref_stride << left_shift; - 
second_pred += 128 << left_shift; - sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, - second_pred); - return sum; -} - -// SAD 4D -// Combine 4 __m256i vectors to uint32_t result[4] -static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, - uint32_t *res) { - __m256i u0, u1, u2, u3; - const __m256i mask = yy_set1_64_from_32i(UINT32_MAX); - __m128i sad; - - // 8 32-bit summation - u0 = _mm256_srli_si256(v[0], 4); - u1 = _mm256_srli_si256(v[1], 4); - u2 = _mm256_srli_si256(v[2], 4); - u3 = _mm256_srli_si256(v[3], 4); - - u0 = _mm256_add_epi32(u0, v[0]); - u1 = _mm256_add_epi32(u1, v[1]); - u2 = _mm256_add_epi32(u2, v[2]); - u3 = _mm256_add_epi32(u3, v[3]); - - u0 = _mm256_and_si256(u0, mask); - u1 = _mm256_and_si256(u1, mask); - u2 = _mm256_and_si256(u2, mask); - u3 = _mm256_and_si256(u3, mask); - // 4 32-bit summation, evenly positioned - - u1 = _mm256_slli_si256(u1, 4); - u3 = _mm256_slli_si256(u3, 4); - - u0 = _mm256_or_si256(u0, u1); - u2 = _mm256_or_si256(u2, u3); - // 8 32-bit summation, interleaved - - u1 = _mm256_unpacklo_epi64(u0, u2); - u3 = _mm256_unpackhi_epi64(u0, u2); - - u0 = _mm256_add_epi32(u1, u3); - sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), - _mm256_castsi256_si128(u0)); - _mm_storeu_si128((__m128i *)res, sad); -} - -static void convert_pointers(const uint8_t *const ref8[], - const uint16_t *ref[]) { - ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); - ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); - ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); - ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); -} - -static void init_sad(__m256i *s) { - s[0] = _mm256_setzero_si256(); - s[1] = _mm256_setzero_si256(); - s[2] = _mm256_setzero_si256(); - s[3] = _mm256_setzero_si256(); -} - -void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - __m256i sad_vec[4]; - const uint16_t *refp[4]; - const uint16_t *keep = CONVERT_TO_SHORTPTR(src); - const uint16_t *srcp; - 
const int shift_for_4_rows = 2; - int i; - - init_sad(sad_vec); - convert_pointers(ref_array, refp); - - for (i = 0; i < 4; ++i) { - srcp = keep; - sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); - srcp += src_stride << shift_for_4_rows; - refp[i] += ref_stride << shift_for_4_rows; - sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); - } - get_4d_sad_from_mm256_epi32(sad_vec, sad_array); -} - -void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first8rows[4]; - uint32_t second8rows[4]; - const uint8_t *ref[4]; - const int shift_for_8_rows = 3; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows); - src += src_stride << shift_for_8_rows; - ref[0] += ref_stride << shift_for_8_rows; - ref[1] += ref_stride << shift_for_8_rows; - ref[2] += ref_stride << shift_for_8_rows; - ref[3] += ref_stride << shift_for_8_rows; - aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows); - sad_array[0] = first8rows[0] + second8rows[0]; - sad_array[1] = first8rows[1] + second8rows[1]; - sad_array[2] = first8rows[2] + second8rows[2]; - sad_array[3] = first8rows[3] + second8rows[3]; -} - -void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 4; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << 
shift_for_rows; - aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - __m256i sad_vec[4]; - const uint16_t *refp[4]; - const uint16_t *keep = CONVERT_TO_SHORTPTR(src); - const uint16_t *srcp; - const int shift_for_4_rows = 2; - int i; - int rows_section; - - init_sad(sad_vec); - convert_pointers(ref_array, refp); - - for (i = 0; i < 4; ++i) { - srcp = keep; - rows_section = 0; - while (rows_section < 4) { - sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); - srcp += src_stride << shift_for_4_rows; - refp[i] += ref_stride << shift_for_4_rows; - rows_section++; - } - } - get_4d_sad_from_mm256_epi32(sad_vec, sad_array); -} - -void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 4; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int 
src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 5; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - __m256i sad_vec[4]; - const uint16_t *refp[4]; - const uint16_t *keep = CONVERT_TO_SHORTPTR(src); - const uint16_t *srcp; - const int shift_for_rows = 1; - int i; - int rows_section; - - init_sad(sad_vec); - convert_pointers(ref_array, refp); - - for (i = 0; i < 4; ++i) { - srcp = keep; - rows_section = 0; - while (rows_section < 16) { - sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); - srcp += src_stride << shift_for_rows; - refp[i] += ref_stride << shift_for_rows; - rows_section++; - } - } - get_4d_sad_from_mm256_epi32(sad_vec, sad_array); -} - -void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 5; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, 
ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 6; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - __m256i sad_vec[4]; - const uint16_t *refp[4]; - const uint16_t *keep = CONVERT_TO_SHORTPTR(src); - const uint16_t *srcp; - int i; - int rows_section; - - init_sad(sad_vec); - convert_pointers(ref_array, refp); - - for (i = 0; i < 4; ++i) { - srcp = keep; - rows_section = 0; - while (rows_section < 64) { - sad128x1(srcp, refp[i], NULL, &sad_vec[i]); - srcp += src_stride; - refp[i] += ref_stride; - 
rows_section++; - } - } - get_4d_sad_from_mm256_epi32(sad_vec, sad_array); -} - -void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 6; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c deleted file mode 100644 index c6fd62c9e..000000000 --- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - __m256i s1, s2, r1, r2; - __m256i sum = _mm256_setzero_si256(); - __m128i sum_i128; - int i; - - for (i = 0; i < 16; ++i) { - r1 = _mm256_loadu_si256((__m256i const *)ref_ptr); - r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); - s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr)); - s2 = _mm256_sad_epu8( - r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); - sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2)); - ref_ptr += ref_stride << 1; - src_ptr += src_stride << 1; - } - - sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8)); - sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1), - _mm256_castsi256_si128(sum)); - return _mm_cvtsi128_si32(sum_i128); -} - -static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - unsigned int half_width = 32; - uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); - src_ptr += half_width; - ref_ptr += half_width; - sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); - return sum; -} - -static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); - src_ptr += src_stride << 5; - ref_ptr += ref_stride << 5; - sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); - return sum; -} - -unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - unsigned int half_width = 64; - uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); - src_ptr += half_width; - ref_ptr += half_width; - sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); - return sum; -} - -unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int 
src_stride, - const uint8_t *ref_ptr, int ref_stride) { - uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); - src_ptr += src_stride << 6; - ref_ptr += ref_stride << 6; - sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); - return sum; -} - -unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); - src_ptr += src_stride << 6; - ref_ptr += ref_stride << 6; - sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); - return sum; -} - -static void sad64x64x4d(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - __m128i *res) { - uint32_t sum[4]; - aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum); - *res = _mm_loadu_si128((const __m128i *)sum); -} - -void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m128i sum0, sum1; - const uint8_t *rf[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); - src += src_stride << 6; - rf[0] += ref_stride << 6; - rf[1] += ref_stride << 6; - rf[2] += ref_stride << 6; - rf[3] += ref_stride << 6; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); - sum0 = _mm_add_epi32(sum0, sum1); - _mm_storeu_si128((__m128i *)res, sum0); -} - -void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m128i sum0, sum1; - unsigned int half_width = 64; - const uint8_t *rf[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); - src += half_width; - rf[0] += half_width; - rf[1] += half_width; - rf[2] += half_width; - rf[3] += half_width; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); - sum0 = _mm_add_epi32(sum0, sum1); 
- _mm_storeu_si128((__m128i *)res, sum0); -} - -void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += src_stride << 6; - rf[0] += ref_stride << 6; - rf[1] += ref_stride << 6; - rf[2] += ref_stride << 6; - rf[3] += ref_stride << 6; - aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} - -static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int h, const uint8_t *second_pred, - const int second_pred_stride) { - int i, res; - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; - __m256i sum_sad = _mm256_setzero_si256(); - __m256i sum_sad_h; - __m128i sum_sad128; - for (i = 0; i < h; i++) { - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); - ref1_reg = _mm256_avg_epu8( - ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); - ref2_reg = _mm256_avg_epu8( - ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); - sad1_reg = - _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); - sad2_reg = _mm256_sad_epu8( - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); - ref_ptr += ref_stride; - src_ptr += src_stride; - second_pred += second_pred_stride; - } - sum_sad_h = _mm256_srli_si256(sum_sad, 8); - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); - res = 
_mm_cvtsi128_si32(sum_sad128); - - return res; -} - -unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, - second_pred, 64); - src_ptr += src_stride << 6; - ref_ptr += ref_stride << 6; - second_pred += 64 << 6; - sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, - second_pred, 64); - return sum; -} - -unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - unsigned int half_width = 64; - uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, - second_pred, 128); - src_ptr += half_width; - ref_ptr += half_width; - second_pred += half_width; - sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, - second_pred, 128); - return sum; -} - -unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, - ref_stride, second_pred); - src_ptr += src_stride << 6; - ref_ptr += ref_stride << 6; - second_pred += 128 << 6; - sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, - second_pred); - return sum; -} diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm deleted file mode 100644 index 3251b7655..000000000 --- a/third_party/aom/aom_dsp/x86/sad_sse2.asm +++ /dev/null @@ -1,353 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro SAD_FN 4 -%if %4 == 0 -%if %3 == 5 -cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows -%else ; %3 == 7 -cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows -%endif ; %3 == 5/7 -%else ; avg -%if %3 == 5 -cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ - second_pred, n_rows -%else ; %3 == 7 -cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ - ref, ref_stride, \ - second_pred, \ - src_stride3, ref_stride3 -%if ARCH_X86_64 -%define n_rowsd r7d -%else ; x86-32 -%define n_rowsd dword r0m -%endif ; x86-32/64 -%endif ; %3 == 5/7 -%endif ; avg/sad - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided -%if %3 == 7 - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] -%endif ; %3 == 7 -%endmacro - -; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD128XN 1-2 0 - SAD_FN 128, %1, 5, %2 - mov n_rowsd, %1 - pxor m0, m0 - -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+32] - movu m4, [refq+48] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - pavgb m3, [second_predq+mmsize*2] - pavgb m4, [second_predq+mmsize*3] -%endif - psadbw m1, [srcq] - psadbw m2, [srcq+16] - psadbw m3, [srcq+32] - psadbw m4, [srcq+48] - - paddd m1, m2 - paddd m3, m4 - paddd m0, m1 - paddd m0, m3 - - movu m1, [refq+64] - movu m2, [refq+80] - movu m3, [refq+96] - movu m4, [refq+112] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*4] - pavgb m2, [second_predq+mmsize*5] - pavgb m3, [second_predq+mmsize*6] - pavgb m4, [second_predq+mmsize*7] - lea second_predq, [second_predq+mmsize*8] -%endif - psadbw m1, 
[srcq+64] - psadbw m2, [srcq+80] - psadbw m3, [srcq+96] - psadbw m4, [srcq+112] - - add refq, ref_strideq - add srcq, src_strideq - - paddd m1, m2 - paddd m3, m4 - paddd m0, m1 - paddd m0, m3 - - sub n_rowsd, 1 - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD128XN 128 ; sad128x128_sse2 -SAD128XN 128, 1 ; sad128x128_avg_sse2 -SAD128XN 64 ; sad128x64_sse2 -SAD128XN 64, 1 ; sad128x64_avg_sse2 - - -; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD64XN 1-2 0 - SAD_FN 64, %1, 5, %2 - mov n_rowsd, %1 - pxor m0, m0 -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+32] - movu m4, [refq+48] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - pavgb m3, [second_predq+mmsize*2] - pavgb m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - psadbw m1, [srcq] - psadbw m2, [srcq+16] - psadbw m3, [srcq+32] - psadbw m4, [srcq+48] - paddd m1, m2 - paddd m3, m4 - add refq, ref_strideq - paddd m0, m1 - add srcq, src_strideq - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD64XN 128 ; sad64x128_sse2 -SAD64XN 128, 1 ; sad64x128_avg_sse2 -SAD64XN 64 ; sad64x64_sse2 -SAD64XN 32 ; sad64x32_sse2 -SAD64XN 64, 1 ; sad64x64_avg_sse2 -SAD64XN 32, 1 ; sad64x32_avg_sse2 -SAD64XN 16 ; sad64x16_sse2 -SAD64XN 16, 1 ; sad64x16_avg_sse2 - -; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD32XN 1-2 0 - SAD_FN 32, %1, 5, %2 - mov n_rowsd, %1/2 - pxor m0, m0 -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+ref_strideq] - movu m4, [refq+ref_strideq+16] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - pavgb m3, [second_predq+mmsize*2] - pavgb m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - psadbw m1, [srcq] 
- psadbw m2, [srcq+16] - psadbw m3, [srcq+src_strideq] - psadbw m4, [srcq+src_strideq+16] - paddd m1, m2 - paddd m3, m4 - lea refq, [refq+ref_strideq*2] - paddd m0, m1 - lea srcq, [srcq+src_strideq*2] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD32XN 64 ; sad32x64_sse2 -SAD32XN 32 ; sad32x32_sse2 -SAD32XN 16 ; sad32x16_sse2 -SAD32XN 64, 1 ; sad32x64_avg_sse2 -SAD32XN 32, 1 ; sad32x32_avg_sse2 -SAD32XN 16, 1 ; sad32x16_avg_sse2 -SAD32XN 8 ; sad_32x8_sse2 -SAD32XN 8, 1 ; sad_32x8_avg_sse2 - -; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD16XN 1-2 0 - SAD_FN 16, %1, 7, %2 - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movu m1, [refq] - movu m2, [refq+ref_strideq] - movu m3, [refq+ref_strideq*2] - movu m4, [refq+ref_stride3q] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - pavgb m3, [second_predq+mmsize*2] - pavgb m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - psadbw m1, [srcq] - psadbw m2, [srcq+src_strideq] - psadbw m3, [srcq+src_strideq*2] - psadbw m4, [srcq+src_stride3q] - paddd m1, m2 - paddd m3, m4 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD16XN 32 ; sad16x32_sse2 -SAD16XN 16 ; sad16x16_sse2 -SAD16XN 8 ; sad16x8_sse2 -SAD16XN 32, 1 ; sad16x32_avg_sse2 -SAD16XN 16, 1 ; sad16x16_avg_sse2 -SAD16XN 8, 1 ; sad16x8_avg_sse2 -SAD16XN 4 ; sad_16x4_sse2 -SAD16XN 4, 1 ; sad_16x4_avg_sse2 -SAD16XN 64 ; sad_16x64_sse2 -SAD16XN 64, 1 ; sad_16x64_avg_sse2 - -; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD8XN 1-2 0 - SAD_FN 8, %1, 7, %2 - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movh m1, [refq] - movhps m1, [refq+ref_strideq] - movh 
m2, [refq+ref_strideq*2] - movhps m2, [refq+ref_stride3q] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - lea second_predq, [second_predq+mmsize*2] -%endif - movh m3, [srcq] - movhps m3, [srcq+src_strideq] - movh m4, [srcq+src_strideq*2] - movhps m4, [srcq+src_stride3q] - psadbw m1, m3 - psadbw m2, m4 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - paddd m0, m2 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD8XN 16 ; sad8x16_sse2 -SAD8XN 8 ; sad8x8_sse2 -SAD8XN 4 ; sad8x4_sse2 -SAD8XN 16, 1 ; sad8x16_avg_sse2 -SAD8XN 8, 1 ; sad8x8_avg_sse2 -SAD8XN 4, 1 ; sad8x4_avg_sse2 -SAD8XN 32 ; sad_8x32_sse2 -SAD8XN 32, 1 ; sad_8x32_avg_sse2 - -; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD4XN 1-2 0 - SAD_FN 4, %1, 7, %2 - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movd m1, [refq] - movd m2, [refq+ref_strideq] - movd m3, [refq+ref_strideq*2] - movd m4, [refq+ref_stride3q] - punpckldq m1, m2 - punpckldq m3, m4 - movlhps m1, m3 -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - lea second_predq, [second_predq+mmsize*1] -%endif - movd m2, [srcq] - movd m5, [srcq+src_strideq] - movd m4, [srcq+src_strideq*2] - movd m3, [srcq+src_stride3q] - punpckldq m2, m5 - punpckldq m4, m3 - movlhps m2, m4 - psadbw m1, m2 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD4XN 8 ; sad4x8_sse -SAD4XN 4 ; sad4x4_sse -SAD4XN 8, 1 ; sad4x8_avg_sse -SAD4XN 4, 1 ; sad4x4_avg_sse -SAD4XN 16 ; sad_4x16_sse2 -SAD4XN 16, 1 ; sad_4x16_avg_sse2 diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c deleted file mode 100644 index 305dde5c0..000000000 --- a/third_party/aom/aom_dsp/x86/sse_avx2.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - 
* Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <smmintrin.h> -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/synonyms_avx2.h" - -static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, - const uint8_t *b) { - const __m256i v_a0 = yy_loadu_256(a); - const __m256i v_b0 = yy_loadu_256(b); - const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); - const __m256i v_a01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); - const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); - const __m256i v_b01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); - const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); - const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); - *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); - *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); -} - -static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { - int64_t sum; - const __m256i sum0_4x64 = - _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); - const __m256i sum1_4x64 = - _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); - const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); - const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), - _mm256_extracti128_si256(sum_4x64, 1)); - const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, 
_mm_srli_si128(sum_2x64, 8)); - - xx_storel_64(&sum, sum_1x64); - return sum; -} - -int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int32_t y = 0; - int64_t sse = 0; - __m256i sum = _mm256_setzero_si256(); - switch (width) { - case 4: - do { - const __m128i v_a0 = xx_loadl_32(a); - const __m128i v_a1 = xx_loadl_32(a + a_stride); - const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); - const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); - const __m128i v_b0 = xx_loadl_32(b); - const __m128i v_b1 = xx_loadl_32(b + b_stride); - const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); - const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); - const __m128i v_a0123 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); - const __m128i v_b0123 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); - const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); - const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride << 2; - b += b_stride << 2; - y += 4; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 8: - do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m256i v_a_w = - _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); - const __m256i v_b_w = - _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride << 1; - b += b_stride << 1; - y += 2; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 16: - do { - const __m128i v_a0 = xx_loadu_128(a); - const __m128i v_b0 = xx_loadu_128(b); - const __m256i v_a_w = 
_mm256_cvtepu8_epi16(v_a0); - const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 32: - do { - sse_w32_avx2(&sum, a, b); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 64: - do { - sse_w32_avx2(&sum, a, b); - sse_w32_avx2(&sum, a + 32, b + 32); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 128: - do { - sse_w32_avx2(&sum, a, b); - sse_w32_avx2(&sum, a + 32, b + 32); - sse_w32_avx2(&sum, a + 64, b + 64); - sse_w32_avx2(&sum, a + 96, b + 96); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - default: break; - } - - return sse; -} - -static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, - const uint16_t *b) { - const __m256i v_a_w = yy_loadu_256(a); - const __m256i v_b_w = yy_loadu_256(b); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); -} - -int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, int width, int height) { - int32_t y = 0; - int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - __m256i sum = _mm256_setzero_si256(); - switch (width) { - case 4: - do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); - const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); - const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); - const 
__m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), - _mm_unpacklo_epi64(v_a2, v_a3)); - const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), - _mm_unpacklo_epi64(v_b2, v_b3)); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride << 2; - b += b_stride << 2; - y += 4; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 8: - do { - const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); - const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride << 1; - b += b_stride << 1; - y += 2; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 16: - do { - highbd_sse_w16_avx2(&sum, a, b); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 32: - do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16, b + 16); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 64: - do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_avx2(&sum); - break; - case 128: - do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); - highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4); - highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5); - highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6); - highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = 
summary_all_avx2(&sum); - break; - default: break; - } - return sse; -} diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c deleted file mode 100644 index 8b5af8469..000000000 --- a/third_party/aom/aom_dsp/x86/sse_sse4.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <smmintrin.h> - -#include "config/aom_config.h" - -#include "aom_ports/mem.h" -#include "aom/aom_integer.h" -#include "aom_dsp/x86/synonyms.h" - -static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { - int64_t sum; - const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); - const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); - const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); - const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); - xx_storel_64(&sum, sum_1x64); - return sum; -} - -static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, - const uint8_t *b) { - const __m128i v_a0 = xx_loadu_128(a); - const __m128i v_b0 = xx_loadu_128(b); - const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); - const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); - const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); - const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); - const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); - const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); - *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); - 
*sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); -} - -int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { - int y = 0; - int64_t sse = 0; - __m128i sum = _mm_setzero_si128(); - switch (width) { - case 4: - do { - const __m128i v_a0 = xx_loadl_32(a); - const __m128i v_a1 = xx_loadl_32(a + a_stride); - const __m128i v_b0 = xx_loadl_32(b); - const __m128i v_b1 = xx_loadl_32(b + b_stride); - const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); - const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); - a += a_stride << 1; - b += b_stride << 1; - y += 2; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 8: - do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); - const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 16: - do { - sse_w16_sse4_1(&sum, a, b); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 32: - do { - sse_w16_sse4_1(&sum, a, b); - sse_w16_sse4_1(&sum, a + 16, b + 16); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 64: - do { - sse_w16_sse4_1(&sum, a, b); - sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); - sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); - sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 128: - do { - sse_w16_sse4_1(&sum, a, b); - sse_w16_sse4_1(&sum, a + 16 * 1, b 
+ 16 * 1); - sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); - sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); - sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); - sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); - sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); - sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - default: break; - } - - return sse; -} - -static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, - const uint16_t *b) { - const __m128i v_a_w = xx_loadu_128(a); - const __m128i v_b_w = xx_loadu_128(b); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); -} - -int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int width, - int height) { - int32_t y = 0; - int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - __m128i sum = _mm_setzero_si128(); - switch (width) { - case 4: - do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); - const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); - a += a_stride << 1; - b += b_stride << 1; - y += 2; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 8: - do { - highbd_sse_w8_sse4_1(&sum, a, b); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 16: - do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8, b + 8); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 32: - do { - 
highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 64: - do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - case 128: - do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); - highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8); - highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9); - highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10); - highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11); - highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12); - highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13); - highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14); - highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15); - a += a_stride; - b += b_stride; - y += 1; - } while (y < height); - sse = summary_all_sse4(&sum); - break; - default: break; - } - return sse; -} diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm deleted file mode 100644 index 6d9b5a12f..000000000 --- 
a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm +++ /dev/null @@ -1,222 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr -%macro TABULATE_SSIM 0 - paddusw xmm15, xmm3 ; sum_s - paddusw xmm14, xmm4 ; sum_r - movdqa xmm1, xmm3 - pmaddwd xmm1, xmm1 - paddd xmm13, xmm1 ; sum_sq_s - movdqa xmm2, xmm4 - pmaddwd xmm2, xmm2 - paddd xmm12, xmm2 ; sum_sq_r - pmaddwd xmm3, xmm4 - paddd xmm11, xmm3 ; sum_sxr -%endmacro - -; Sum across the register %1 starting with q words -%macro SUM_ACROSS_Q 1 - movdqa xmm2,%1 - punpckldq %1,xmm0 - punpckhdq xmm2,xmm0 - paddq %1,xmm2 - movdqa xmm2,%1 - punpcklqdq %1,xmm0 - punpckhqdq xmm2,xmm0 - paddq %1,xmm2 -%endmacro - -; Sum across the register %1 starting with q words -%macro SUM_ACROSS_W 1 - movdqa xmm1, %1 - punpcklwd %1,xmm0 - punpckhwd xmm1,xmm0 - paddd %1, xmm1 - SUM_ACROSS_Q %1 -%endmacro - -SECTION .text - -;void ssim_parms_sse2( -; unsigned char *s, -; int sp, -; unsigned char *r, -; int rp -; uint32_t *sum_s, -; uint32_t *sum_r, -; uint32_t *sum_sq_s, -; uint32_t *sum_sq_r, -; uint32_t *sum_sxr); -; -; TODO: Use parm passing through structure, probably don't need the pxors -; ( calling app will initialize to 0 ) could easily fit everything in sse2 -; without too much hastle, and can probably do better estimates with psadw -; or pavgb At this point this is just meant to be first pass for calculating -; all the parms needed for 16x16 ssim 
so we can play with dssim as distortion -; in mode selection code. -global sym(aom_ssim_parms_16x16_sse2) PRIVATE -sym(aom_ssim_parms_16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 15 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;s - mov rcx, arg(1) ;sp - mov rdi, arg(2) ;r - mov rax, arg(3) ;rp - - pxor xmm0, xmm0 - pxor xmm15,xmm15 ;sum_s - pxor xmm14,xmm14 ;sum_r - pxor xmm13,xmm13 ;sum_sq_s - pxor xmm12,xmm12 ;sum_sq_r - pxor xmm11,xmm11 ;sum_sxr - - mov rdx, 16 ;row counter -.NextRow: - - ;grab source and reference pixels - movdqu xmm5, [rsi] - movdqu xmm6, [rdi] - movdqa xmm3, xmm5 - movdqa xmm4, xmm6 - punpckhbw xmm3, xmm0 ; high_s - punpckhbw xmm4, xmm0 ; high_r - - TABULATE_SSIM - - movdqa xmm3, xmm5 - movdqa xmm4, xmm6 - punpcklbw xmm3, xmm0 ; low_s - punpcklbw xmm4, xmm0 ; low_r - - TABULATE_SSIM - - add rsi, rcx ; next s row - add rdi, rax ; next r row - - dec rdx ; counter - jnz .NextRow - - SUM_ACROSS_W xmm15 - SUM_ACROSS_W xmm14 - SUM_ACROSS_Q xmm13 - SUM_ACROSS_Q xmm12 - SUM_ACROSS_Q xmm11 - - mov rdi,arg(4) - movd [rdi], xmm15; - mov rdi,arg(5) - movd [rdi], xmm14; - mov rdi,arg(6) - movd [rdi], xmm13; - mov rdi,arg(7) - movd [rdi], xmm12; - mov rdi,arg(8) - movd [rdi], xmm11; - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void ssim_parms_sse2( -; unsigned char *s, -; int sp, -; unsigned char *r, -; int rp -; uint32_t *sum_s, -; uint32_t *sum_r, -; uint32_t *sum_sq_s, -; uint32_t *sum_sq_r, -; uint32_t *sum_sxr); -; -; TODO: Use parm passing through structure, probably don't need the pxors -; ( calling app will initialize to 0 ) could easily fit everything in sse2 -; without too much hastle, and can probably do better estimates with psadw -; or pavgb At this point this is just meant to be first pass for calculating -; all the parms needed for 16x16 ssim so we can play with dssim as distortion -; in mode selection code. 
-global sym(aom_ssim_parms_8x8_sse2) PRIVATE -sym(aom_ssim_parms_8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 15 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;s - mov rcx, arg(1) ;sp - mov rdi, arg(2) ;r - mov rax, arg(3) ;rp - - pxor xmm0, xmm0 - pxor xmm15,xmm15 ;sum_s - pxor xmm14,xmm14 ;sum_r - pxor xmm13,xmm13 ;sum_sq_s - pxor xmm12,xmm12 ;sum_sq_r - pxor xmm11,xmm11 ;sum_sxr - - mov rdx, 8 ;row counter -.NextRow: - - ;grab source and reference pixels - movq xmm3, [rsi] - movq xmm4, [rdi] - punpcklbw xmm3, xmm0 ; low_s - punpcklbw xmm4, xmm0 ; low_r - - TABULATE_SSIM - - add rsi, rcx ; next s row - add rdi, rax ; next r row - - dec rdx ; counter - jnz .NextRow - - SUM_ACROSS_W xmm15 - SUM_ACROSS_W xmm14 - SUM_ACROSS_Q xmm13 - SUM_ACROSS_Q xmm12 - SUM_ACROSS_Q xmm11 - - mov rdi,arg(4) - movd [rdi], xmm15; - mov rdi,arg(5) - movd [rdi], xmm14; - mov rdi,arg(6) - movd [rdi], xmm13; - mov rdi,arg(7) - movd [rdi], xmm12; - mov rdi,arg(8) - movd [rdi], xmm11; - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm deleted file mode 100644 index 45bf6ec3c..000000000 --- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm +++ /dev/null @@ -1,1481 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_8: times 8 dw 8 -bilin_filter_m_sse2: times 8 dw 16 - times 8 dw 0 - times 8 dw 14 - times 8 dw 2 - times 8 dw 12 - times 8 dw 4 - times 8 dw 10 - times 8 dw 6 - times 16 dw 8 - times 8 dw 6 - times 8 dw 10 - times 8 dw 4 - times 8 dw 12 - times 8 dw 2 - times 8 dw 14 - -bilin_filter_m_ssse3: times 8 db 16, 0 - times 8 db 14, 2 - times 8 db 12, 4 - times 8 db 10, 6 - times 16 db 8 - times 8 db 6, 10 - times 8 db 4, 12 - times 8 db 2, 14 - -SECTION .text - -; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, -; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, -; int height, unsigned int *sse); -; -; This function returns the SE and stores SSE in the given pointer. - -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse - psubw %3, %4 - psubw %1, %2 - paddw %5, %3 - pmaddwd %3, %3 - paddw %5, %1 - pmaddwd %1, %1 - paddd %6, %3 - paddd %6, %1 -%endmacro - -%macro STORE_AND_RET 1 -%if %1 > 4 - ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit - ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. - ; We have to sign-extend it before adding the words within the register - ; and outputing to a dword. 
- pcmpgtw m5, m6 ; mask for 0 > x - movhlps m3, m7 - punpcklwd m4, m6, m5 - punpckhwd m6, m5 ; sign-extend m6 word->dword - paddd m7, m3 - paddd m6, m4 - pshufd m3, m7, 0x1 - movhlps m4, m6 - paddd m7, m3 - paddd m6, m4 - mov r1, ssem ; r1 = unsigned int *sse - pshufd m4, m6, 0x1 - movd [r1], m7 ; store sse - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%else ; 4xh - pshuflw m4, m6, 0xe - pshuflw m3, m7, 0xe - paddw m6, m4 - paddd m7, m3 - pcmpgtw m5, m6 ; mask for 0 > x - mov r1, ssem ; r1 = unsigned int *sse - punpcklwd m6, m5 ; sign-extend m6 word->dword - movd [r1], m7 ; store sse - pshuflw m4, m6, 0xe - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%endif - RET -%endmacro - -%macro INC_SRC_BY_SRC_STRIDE 0 -%if ARCH_X86=1 && CONFIG_PIC=1 - add srcq, src_stridemp -%else - add srcq, src_strideq -%endif -%endmacro - -%macro SUBPEL_VARIANCE 1-2 0 ; W -%if cpuflag(ssse3) -%define bilin_filter_m bilin_filter_m_ssse3 -%define filter_idx_shift 4 -%else -%define bilin_filter_m bilin_filter_m_sse2 -%define filter_idx_shift 5 -%endif -; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses -; 11, not 13, if the registers are ordered correctly. 
May make a minor speed -; difference on Win64 - -%if ARCH_X86_64 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq - %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %endif - %define block_height heightd - %define bilin_filter sseq -%else - %if CONFIG_PIC=1 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse, \ - g_bilin_filter, g_pw_8 - %define block_height dword heightm - %define sec_str sec_stridemp - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse, g_bilin_filter, g_pw_8 - %define block_height heightd - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif - %else - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, sec, sec_stride, \ - height, sse - %define block_height dword heightm - %define sec_str sec_stridemp - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %define block_height heightd - %endif - %define bilin_filter bilin_filter_m - %endif -%endif - -%if %1 == 4 - %define movx movd -%else - %define 
movx movh -%endif - - ASSERT %1 <= 16 ; m6 overflows if w > 16 - pxor m6, m6 ; sum - pxor m7, m7 ; sse - ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we - ; could perhaps use it for something more productive then - pxor m5, m5 ; dedicated zero register -%if %1 < 16 - sar block_height, 1 -%if %2 == 1 ; avg - shl sec_str, 1 -%endif -%endif - - ; FIXME(rbultje) replace by jumptable? - test x_offsetd, x_offsetd - jnz .x_nonzero - ; x_offset == 0 - test y_offsetd, y_offsetd - jnz .x_zero_y_nonzero - - ; x_offset == 0 && y_offset == 0 -.x_zero_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - mova m1, [dstq] -%if %2 == 1 ; avg - pavgb m0, [secq] - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - -%if %2 == 0 ; !avg - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] -%endif - - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - -%if %2 == 1 ; avg -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_zero_loop - STORE_AND_RET %1 - -.x_zero_y_nonzero: - cmp y_offsetd, 4 - jne .x_zero_y_nonhalf - - ; x_offset == 0 && y_offset == 0.5 -.x_zero_y_half_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 
-%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq*2] -%else ; 4xh - movx m1, [srcq+src_strideq*2] - punpckldq m2, m1 -%endif - movx m1, [dstq] -%if %1 > 4 - movlhps m0, m2 -%else ; 4xh - punpckldq m0, m2 -%endif - movx m3, [dstq+dst_strideq] - pavgb m0, m2 - punpcklbw m1, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m3, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m4, [secq] - pavgb m0, m4 - punpcklbw m3, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq*2] - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_half_loop - STORE_AND_RET %1 - -.x_zero_y_nonhalf: - ; x_offset == 0 && y_offset == bilin interpolation -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ; x86-32 or mmx -%if ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0, reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b 
[y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_zero_y_other_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can - ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of - ; instructions is the same (5), but it is 1 mul instead of 2, so might be - ; slightly faster because of pmullw latency. It would also cut our rodata - ; tables in half for this function, and save 1-2 registers on x86-64. - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m4, filter_y_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m4, filter_y_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb 
m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonzero: - cmp x_offsetd, 4 - jne .x_nonhalf - ; x_offset == 0.5 - test y_offsetd, y_offsetd - jnz .x_half_y_nonzero - - ; x_offset == 0.5 && y_offset == 0 -.x_half_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 -%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m4, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] - movhps m4, [srcq+src_strideq+1] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 - movx m2, [srcq+src_strideq+1] - punpckldq m4, m2 -%endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - pavgb m0, m4 - punpcklbw m3, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m1, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] - movx m1, [dstq] - pavgb m0, m4 - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_zero_loop - STORE_AND_RET %1 - -.x_half_y_nonzero: - cmp 
y_offsetd, 4 - jne .x_half_y_nonhalf - - ; x_offset == 0.5 && y_offset == 0.5 -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - pavgb m4, m3 - punpckhbw m3, m1, m5 - pavgb m0, m4 -%if %2 == 1 ; avg - punpcklbw m1, m5 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movx m2, [srcq] - movx m3, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq] - movhps m3, [srcq+src_strideq+1] -%else - movx m1, [srcq+src_strideq] - punpckldq m2, m1 - movx m1, [srcq+src_strideq+1] - punpckldq m3, m1 -%endif - pavgb m2, m3 -%if %1 > 4 - movlhps m0, m2 - movhlps m4, m2 -%else ; 4xh - punpckldq m0, m2 - pshuflw m4, m2, 0xe -%endif - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq] - movx m1, [srcq+src_strideq+1] - pavgb m2, m3 - pavgb m4, m1 - pavgb m0, m2 - pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_half_loop - STORE_AND_RET %1 - -.x_half_y_nonhalf: - ; x_offset == 0.5 && y_offset == bilin interpolation -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl 
y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ;x86_32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0.5. We can reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_other_loop: - movu m4, [srcq] - movu m2, [srcq+1] - mova m1, [dstq] - pavgb m4, m2 -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - punpcklbw m0, m5 - paddw m2, m3 - punpcklbw m3, m4, m5 - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 -%endif - punpckhbw m3, m1, m5 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -%if notcpuflag(ssse3) - punpcklbw m0, m5 -%endif -.x_half_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] - pavgb m2, m1 - pavgb m4, m3 - movx m3, [dstq+dst_strideq] 
-%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - paddw m0, m1 - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m2, m1 - movx m1, [dstq] -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf: - test y_offsetd, y_offsetd - jnz .x_nonhalf_y_nonzero - - ; x_offset == bilin interpolation && y_offset == 0 -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -;y_offset == 0. We can reuse y_offset reg. 
-%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_other_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - pmullw m0, filter_x_a - pmullw m4, filter_x_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - punpcklbw m0, m1 - movx m1, [dstq] - punpcklbw m2, m4 - pmaddubsw m0, filter_x_a - pmaddubsw m2, filter_x_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - 
movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_zero_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonzero: - cmp y_offsetd, 4 - jne .x_nonhalf_y_nonhalf - - ; x_offset == bilin interpolation && y_offset == 0.5 -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; y_offset == 0.5. We can reuse y_offset reg. -%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - add srcq, src_strideq - packuswb m0, m2 -.x_other_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] -%if cpuflag(ssse3) - mova m1, [dstq] - punpckhbw m2, m4, m3 - 
punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - pavgb m0, m4 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%else - punpckhbw m2, m4, m5 - punpckhbw m1, m3, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - paddw m4, m3 - paddw m2, m1 - mova m1, [dstq] - psraw m4, 4 - psraw m2, 4 - punpckhbw m3, m1, m5 - ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we - ; have a 1-register shortage to be able to store the backup of the bilin - ; filtered second line as words as cache for the next line. Packing into - ; a byte costs 1 pack and 2 unpacks, but saves a register. - packuswb m4, m2 - punpcklbw m1, m5 - pavgb m0, m4 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - pavgb m0, [secq] -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - add srcq, src_strideq - psraw m0, 4 -.x_other_y_half_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - paddw m2, filter_rnd - paddw m4, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, 
m1 - movx m1, [dstq] - paddw m4, m3 - movx m3, [dstq+dst_strideq] -%endif - psraw m2, 4 - psraw m4, 4 - pavgw m0, m2 - pavgw m2, m4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - also consider going to bytes here -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_half_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonhalf: -%if ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift - shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m11, [bilin_filter+y_offsetq+16] -%endif - mova m12, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_y_a m10 -%define filter_y_b m11 -%define filter_rnd m12 -%else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 -; In this case, there is NO unused register. Used src_stride register. Later, -; src_stride has to be loaded from stack when it is needed. 
-%define tempq src_strideq - mov tempq, g_bilin_filterm - add x_offsetq, tempq - add y_offsetq, tempq -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter - add y_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - - ; x_offset == bilin interpolation && y_offset == bilin interpolation -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - - INC_SRC_BY_SRC_STRIDE - - packuswb m0, m2 -.x_other_y_other_loop: -%if cpuflag(ssse3) - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - punpckhbw m2, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - punpckhbw m3, m1, m5 - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - punpcklbw m1, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 - psraw m0, 4 -%else - movu m3, [srcq] - movu m4, [srcq+1] - punpckhbw m1, m3, m5 - punpckhbw m2, m4, m5 - punpcklbw m3, m5 - punpcklbw m4, m5 - pmullw m3, filter_x_a - pmullw m4, filter_x_b - paddw m3, filter_rnd - pmullw m1, filter_x_a - pmullw m2, filter_x_b - paddw m1, filter_rnd - paddw m3, m4 - paddw m1, m2 - psraw m3, 4 - psraw m1, 4 
- packuswb m4, m3, m1 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - pmullw m2, filter_y_a - pmullw m1, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m2, m1 - mova m1, [dstq] - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 - punpckhbw m3, m1, m5 - psraw m0, 4 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - psraw m0, 4 -%if cpuflag(ssse3) - packuswb m0, m0 -%endif - - INC_SRC_BY_SRC_STRIDE - -.x_other_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - - INC_SRC_BY_SRC_STRIDE - movx m4, [srcq] - movx m3, [srcq+1] - -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m2, m2 - packuswb m4, m4 - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd - psraw m0, 4 - psraw m2, 4 - punpcklbw m1, m5 -%else - punpcklbw m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, m1 - paddw m4, m3 - psraw m2, 4 - psraw m4, 4 - pmullw m0, filter_y_a - pmullw m3, m2, filter_y_b - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m0, m3 - movx m3, 
[dstq+dst_strideq] - paddw m2, m1 - movx m1, [dstq] - psraw m0, 4 - psraw m2, 4 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_other_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd -%undef movx - STORE_AND_RET %1 -%endmacro - -; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical -; between the ssse3 and non-ssse3 version. It may make sense to merge their -; code in the sense that the ssse3 version would jump to the appropriate -; location in the sse/2 version, rather than duplicating that code in the -; binary. - -INIT_XMM sse2 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM sse2 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c deleted file mode 100644 index 4389d123d..000000000 --- a/third_party/aom/aom_dsp/x86/subtract_avx2.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, - const uint8_t *pred_ptr) { - __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr)); - __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr)); - __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); - __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); - __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); - __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); - const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); - const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); - _mm256_store_si256((__m256i *)(diff_ptr), d_0); - _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1); -} - -static INLINE void aom_subtract_block_16xn_avx2( - int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { - for (int32_t j = 0; j < rows; ++j) { - __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr)); - __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr)); - __m256i s_0 = _mm256_cvtepu8_epi16(s); - __m256i p_0 = _mm256_cvtepu8_epi16(p); - const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); - _mm256_store_si256((__m256i *)(diff_ptr), d_0); - src_ptr += src_stride; - pred_ptr += pred_stride; - diff_ptr += diff_stride; - } -} - -static INLINE void aom_subtract_block_32xn_avx2( - int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { - for (int32_t j = 0; j < rows; ++j) { - subtract32_avx2(diff_ptr, src_ptr, pred_ptr); - src_ptr += src_stride; - pred_ptr += pred_stride; - diff_ptr += diff_stride; - } -} - -static INLINE void aom_subtract_block_64xn_avx2( - int rows, 
int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { - for (int32_t j = 0; j < rows; ++j) { - subtract32_avx2(diff_ptr, src_ptr, pred_ptr); - subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); - src_ptr += src_stride; - pred_ptr += pred_stride; - diff_ptr += diff_stride; - } -} - -static INLINE void aom_subtract_block_128xn_avx2( - int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { - for (int32_t j = 0; j < rows; ++j) { - subtract32_avx2(diff_ptr, src_ptr, pred_ptr); - subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); - subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64); - subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96); - src_ptr += src_stride; - pred_ptr += pred_stride; - diff_ptr += diff_stride; - } -} - -void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, - ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, - ptrdiff_t pred_stride) { - switch (cols) { - case 16: - aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - case 32: - aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - case 64: - aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - case 128: - aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - default: - aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - } -} diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm deleted file mode 100644 index 1a75a234f..000000000 --- 
a/third_party/aom/aom_dsp/x86/subtract_sse2.asm +++ /dev/null @@ -1,146 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -; void aom_subtract_block(int rows, int cols, -; int16_t *diff, ptrdiff_t diff_stride, -; const uint8_t *src, ptrdiff_t src_stride, -; const uint8_t *pred, ptrdiff_t pred_stride) - -INIT_XMM sse2 -cglobal subtract_block, 7, 7, 8, \ - rows, cols, diff, diff_stride, src, src_stride, \ - pred, pred_stride -%define pred_str colsq - pxor m7, m7 ; dedicated zero register - cmp colsd, 4 - je .case_4 - cmp colsd, 8 - je .case_8 - cmp colsd, 16 - je .case_16 - cmp colsd, 32 - je .case_32 - cmp colsd, 64 - je .case_64 - -%macro loop16 6 - mova m0, [srcq+%1] - mova m4, [srcq+%2] - mova m1, [predq+%3] - mova m5, [predq+%4] - punpckhbw m2, m0, m7 - punpckhbw m3, m1, m7 - punpcklbw m0, m7 - punpcklbw m1, m7 - psubw m2, m3 - psubw m0, m1 - punpckhbw m1, m4, m7 - punpckhbw m3, m5, m7 - punpcklbw m4, m7 - punpcklbw m5, m7 - psubw m1, m3 - psubw m4, m5 - mova [diffq+mmsize*0+%5], m0 - mova [diffq+mmsize*1+%5], m2 - mova [diffq+mmsize*0+%6], m4 - mova [diffq+mmsize*1+%6], m1 -%endmacro - - mov pred_str, pred_stridemp -.loop_128: - loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize - loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize - loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize - loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize - lea diffq, 
[diffq+diff_strideq*2] - add predq, pred_str - add srcq, src_strideq - sub rowsd, 1 - jnz .loop_128 - RET - -.case_64: - mov pred_str, pred_stridemp -.loop_64: - loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize - loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize - lea diffq, [diffq+diff_strideq*2] - add predq, pred_str - add srcq, src_strideq - dec rowsd - jg .loop_64 - RET - -.case_32: - mov pred_str, pred_stridemp -.loop_32: - loop16 0, mmsize, 0, mmsize, 0, 2*mmsize - lea diffq, [diffq+diff_strideq*2] - add predq, pred_str - add srcq, src_strideq - dec rowsd - jg .loop_32 - RET - -.case_16: - mov pred_str, pred_stridemp -.loop_16: - loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 - lea diffq, [diffq+diff_strideq*4] - lea predq, [predq+pred_str*2] - lea srcq, [srcq+src_strideq*2] - sub rowsd, 2 - jg .loop_16 - RET - -%macro loop_h 0 - movh m0, [srcq] - movh m2, [srcq+src_strideq] - movh m1, [predq] - movh m3, [predq+pred_str] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - psubw m0, m1 - psubw m2, m3 - mova [diffq], m0 - mova [diffq+diff_strideq*2], m2 -%endmacro - -.case_8: - mov pred_str, pred_stridemp -.loop_8: - loop_h - lea diffq, [diffq+diff_strideq*4] - lea srcq, [srcq+src_strideq*2] - lea predq, [predq+pred_str*2] - sub rowsd, 2 - jg .loop_8 - RET - -INIT_MMX -.case_4: - mov pred_str, pred_stridemp -.loop_4: - loop_h - lea diffq, [diffq+diff_strideq*4] - lea srcq, [srcq+src_strideq*2] - lea predq, [predq+pred_str*2] - sub rowsd, 2 - jg .loop_4 - RET diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c deleted file mode 100644 index 0af44e3a4..000000000 --- a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> -#include <smmintrin.h> - -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/synonyms_avx2.h" -#include "aom_dsp/x86/sum_squares_sse2.h" -#include "config/aom_dsp_rtcd.h" - -static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride, - int width, int height) { - uint64_t result; - __m256i v_acc_q = _mm256_setzero_si256(); - const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff); - for (int col = 0; col < height; col += 4) { - __m256i v_acc_d = _mm256_setzero_si256(); - for (int row = 0; row < width; row += 16) { - const int16_t *tempsrc = src + row; - const __m256i v_val_0_w = - _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); - const __m256i v_val_1_w = - _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); - const __m256i v_val_2_w = - _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); - const __m256i v_val_3_w = - _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); - - const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); - const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); - const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); - const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); - - const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); - const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); - const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d); - - v_acc_d = _mm256_add_epi32(v_acc_d, 
v_sum_0123_d); - } - v_acc_q = - _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q)); - v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32)); - src += 4 * stride; - } - __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q); - __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1); - __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value); - - result_64_2_int = _mm_add_epi64( - result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int)); - - xx_storel_64(&result, result_64_2_int); - - return result; -} - -uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, - int height) { - if (LIKELY(width == 4 && height == 4)) { - return aom_sum_squares_2d_i16_4x4_sse2(src, stride); - } else if (LIKELY(width == 4 && (height & 3) == 0)) { - return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); - } else if (LIKELY(width == 8 && (height & 3) == 0)) { - return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); - } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { - return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); - } else { - return aom_sum_squares_2d_i16_c(src, stride, width, height); - } -} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c deleted file mode 100644 index 22d7739ec..000000000 --- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <emmintrin.h> -#include <stdio.h> - -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/sum_squares_sse2.h" -#include "config/aom_dsp_rtcd.h" - -static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { - const __m128d ad = _mm_castsi128_pd(a); - return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b)); -} - -static INLINE uint64_t xx_cvtsi128_si64(__m128i a) { -#if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(a); -#else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, a); - return tmp; - } -#endif -} - -static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { - const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride); - const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride); - const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride); - const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride); - const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w); - const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w); - - return _mm_add_epi32(v_sq_01_d, v_sq_23_d); -} - -uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { - const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); - __m128i v_sum_d = - _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8)); - return (uint64_t)_mm_cvtsi128_si32(v_sum_d); -} - -uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, - int height) { - int r = 0; - __m128i v_acc_q = _mm_setzero_si128(); - do { - const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride); - v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d); - src += stride << 2; - r += 4; - } while (r < height); - const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); 
- __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32), - _mm_and_si128(v_acc_q, v_zext_mask_q)); - v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8)); - return xx_cvtsi128_si64(v_acc_64); -} - -#ifdef __GNUC__ -// This prevents GCC/Clang from inlining this function into -// aom_sum_squares_2d_i16_sse2, which in turn saves some stack -// maintenance instructions in the common case of 4x4. -__attribute__((noinline)) -#endif -uint64_t -aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, - int height) { - int r = 0; - - const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); - __m128i v_acc_q = _mm_setzero_si128(); - - do { - __m128i v_acc_d = _mm_setzero_si128(); - int c = 0; - do { - const int16_t *b = src + c; - - const __m128i v_val_0_w = xx_load_128(b + 0 * stride); - const __m128i v_val_1_w = xx_load_128(b + 1 * stride); - const __m128i v_val_2_w = xx_load_128(b + 2 * stride); - const __m128i v_val_3_w = xx_load_128(b + 3 * stride); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); - c += 8; - } while (c < width); - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); - - src += 4 * stride; - r += 4; - } while (r < height); - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); - return xx_cvtsi128_si64(v_acc_q); -} - -uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, - int height) { - // 4 elements per row only requires half an 
XMM register, so this - // must be a special case, but also note that over 75% of all calls - // are with size == 4, so it is also the common case. - if (LIKELY(width == 4 && height == 4)) { - return aom_sum_squares_2d_i16_4x4_sse2(src, stride); - } else if (LIKELY(width == 4 && (height & 3) == 0)) { - return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); - } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { - // Generic case - return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); - } else { - return aom_sum_squares_2d_i16_c(src, stride, width, height); - } -} - -////////////////////////////////////////////////////////////////////////////// -// 1D version -////////////////////////////////////////////////////////////////////////////// - -static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { - const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); - __m128i v_acc0_q = _mm_setzero_si128(); - __m128i v_acc1_q = _mm_setzero_si128(); - - const int16_t *const end = src + n; - - assert(n % 64 == 0); - - while (src < end) { - const __m128i v_val_0_w = xx_load_128(src); - const __m128i v_val_1_w = xx_load_128(src + 8); - const __m128i v_val_2_w = xx_load_128(src + 16); - const __m128i v_val_3_w = xx_load_128(src + 24); - const __m128i v_val_4_w = xx_load_128(src + 32); - const __m128i v_val_5_w = xx_load_128(src + 40); - const __m128i v_val_6_w = xx_load_128(src + 48); - const __m128i v_val_7_w = xx_load_128(src + 56); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = 
_mm_madd_epi16(v_val_7_w, v_val_7_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); - - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); - - const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); - - v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); - v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); - - src += 64; - } - - v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); - v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); - return xx_cvtsi128_si64(v_acc0_q); -} - -uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { - if (n % 64 == 0) { - return aom_sum_squares_i16_64n_sse2(src, n); - } else if (n > 64) { - int k = n & ~(64 - 1); - return aom_sum_squares_i16_64n_sse2(src, k) + - aom_sum_squares_i16_c(src + k, n - k); - } else { - return aom_sum_squares_i16_c(src, n); - } -} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h deleted file mode 100644 index 491e31cc5..000000000 --- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_ -#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_ - -uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, - int width, int height); - -uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, - int height); -uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride); - -#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h deleted file mode 100644 index 1e9f1e27b..000000000 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ -#define AOM_AOM_DSP_X86_SYNONYMS_H_ - -#include <immintrin.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -/** - * Various reusable shorthands for x86 SIMD intrinsics. - * - * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. - * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. - */ - -// Loads and stores to do away with the tedium of casting the address -// to the right type. 
-static INLINE __m128i xx_loadl_32(const void *a) { - return _mm_cvtsi32_si128(*(const uint32_t *)a); -} - -static INLINE __m128i xx_loadl_64(const void *a) { - return _mm_loadl_epi64((const __m128i *)a); -} - -static INLINE __m128i xx_load_128(const void *a) { - return _mm_load_si128((const __m128i *)a); -} - -static INLINE __m128i xx_loadu_128(const void *a) { - return _mm_loadu_si128((const __m128i *)a); -} - -static INLINE void xx_storel_32(void *const a, const __m128i v) { - *(uint32_t *)a = _mm_cvtsi128_si32(v); -} - -static INLINE void xx_storel_64(void *const a, const __m128i v) { - _mm_storel_epi64((__m128i *)a, v); -} - -static INLINE void xx_store_128(void *const a, const __m128i v) { - _mm_store_si128((__m128i *)a, v); -} - -static INLINE void xx_storeu_128(void *const a, const __m128i v) { - _mm_storeu_si128((__m128i *)a, v); -} - -// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm_set_epi64x() -// acting on 32-bit integers. -static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) { -#if defined(_MSC_VER) && _MSC_VER < 1900 - return _mm_set_epi32(0, e1, 0, e0); -#else - return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0); -#endif -} - -// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm_set1_epi64x() -// acting on a 32-bit integer. 
-static INLINE __m128i xx_set1_64_from_32i(int32_t a) { -#if defined(_MSC_VER) && _MSC_VER < 1900 - return _mm_set_epi32(0, a, 0, a); -#else - return _mm_set1_epi64x((uint32_t)a); -#endif -} - -static INLINE __m128i xx_round_epu16(__m128i v_val_w) { - return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); -} - -static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { - const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); - return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); -} - -static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srli_epi32(v_tmp_d, bits); -} - -// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) -static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - -static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); - const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); - const __m128i v_tmp_d = - _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d); - return _mm_srai_epi16(v_tmp_d, bits); -} - -#endif // AOM_AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h deleted file mode 100644 index 3f69b120e..000000000 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ -#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ - -#include <immintrin.h> - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -/** - * Various reusable shorthands for x86 SIMD intrinsics. - * - * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. - * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. - */ - -// Loads and stores to do away with the tedium of casting the address -// to the right type. -static INLINE __m256i yy_load_256(const void *a) { - return _mm256_load_si256((const __m256i *)a); -} - -static INLINE __m256i yy_loadu_256(const void *a) { - return _mm256_loadu_si256((const __m256i *)a); -} - -static INLINE void yy_store_256(void *const a, const __m256i v) { - _mm256_store_si256((__m256i *)a, v); -} - -static INLINE void yy_storeu_256(void *const a, const __m256i v) { - _mm256_storeu_si256((__m256i *)a, v); -} - -// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio -// compilers. The following function is equivalent to _mm256_set1_epi64x() -// acting on a 32-bit integer. -static INLINE __m256i yy_set1_64_from_32i(int32_t a) { -#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a); -#else - return _mm256_set1_epi64x((uint32_t)a); -#endif -} - -// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We -// therefore define an equivalent function using a different intrinsic. 
-// ([ hi ], [ lo ]) -> [ hi ][ lo ] -static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); -} - -static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { - __m128i mhi = _mm_loadu_si128((__m128i *)(hi)); - __m128i mlo = _mm_loadu_si128((__m128i *)(lo)); - return yy_set_m128i(mhi, mlo); -} - -static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { - const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); - return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); -} -#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h deleted file mode 100644 index d0d1ee684..000000000 --- a/third_party/aom/aom_dsp/x86/transpose_sse2.h +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ -#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_config.h" - -static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { - // Unpack 16 bit elements. 
Goes from: - // in[0]: 00 01 02 03 - // in[1]: 10 11 12 13 - // in[2]: 20 21 22 23 - // in[3]: 30 31 32 33 - // to: - // a0: 00 10 01 11 02 12 03 13 - // a1: 20 30 21 31 22 32 23 33 - const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); - - // Unpack 32 bit elements resulting in: - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - return _mm_unpacklo_epi16(a0, a1); -} - -static INLINE void transpose_8bit_8x8(const __m128i *const in, - __m128i *const out) { - // Unpack 8 bit elements. Goes from: - // in[0]: 00 01 02 03 04 05 06 07 - // in[1]: 10 11 12 13 14 15 16 17 - // in[2]: 20 21 22 23 24 25 26 27 - // in[3]: 30 31 32 33 34 35 36 37 - // in[4]: 40 41 42 43 44 45 46 47 - // in[5]: 50 51 52 53 54 55 56 57 - // in[6]: 60 61 62 63 64 65 66 67 - // in[7]: 70 71 72 73 74 75 76 77 - // to: - // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); - const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); - const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); - - // Unpack 16 bit elements resulting in: - // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - const __m128i b0 = _mm_unpacklo_epi16(a0, a1); - const __m128i b1 = _mm_unpackhi_epi16(a0, a1); - const __m128i b2 = _mm_unpacklo_epi16(a2, a3); - const __m128i b3 = _mm_unpackhi_epi16(a2, a3); - - // Unpack 32 bit elements resulting in: - // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - // c3: 06 16 26 
36 46 56 66 76 07 17 27 37 47 57 67 77 - const __m128i c0 = _mm_unpacklo_epi32(b0, b2); - const __m128i c1 = _mm_unpackhi_epi32(b0, b2); - const __m128i c2 = _mm_unpacklo_epi32(b1, b3); - const __m128i c3 = _mm_unpackhi_epi32(b1, b3); - - // Unpack 64 bit elements resulting in: - // out[0]: 00 10 20 30 40 50 60 70 - // out[1]: 01 11 21 31 41 51 61 71 - // out[2]: 02 12 22 32 42 52 62 72 - // out[3]: 03 13 23 33 43 53 63 73 - // out[4]: 04 14 24 34 44 54 64 74 - // out[5]: 05 15 25 35 45 55 65 75 - // out[6]: 06 16 26 36 46 56 66 76 - // out[7]: 07 17 27 37 47 57 67 77 - out[0] = _mm_unpacklo_epi64(c0, c0); - out[1] = _mm_unpackhi_epi64(c0, c0); - out[2] = _mm_unpacklo_epi64(c1, c1); - out[3] = _mm_unpackhi_epi64(c1, c1); - out[4] = _mm_unpacklo_epi64(c2, c2); - out[5] = _mm_unpackhi_epi64(c2, c2); - out[6] = _mm_unpacklo_epi64(c3, c3); - out[7] = _mm_unpackhi_epi64(c3, c3); -} - -static INLINE void transpose_16bit_4x4(const __m128i *const in, - __m128i *const out) { - // Unpack 16 bit elements. Goes from: - // in[0]: 00 01 02 03 XX XX XX XX - // in[1]: 10 11 12 13 XX XX XX XX - // in[2]: 20 21 22 23 XX XX XX XX - // in[3]: 30 31 32 33 XX XX XX XX - // to: - // a0: 00 10 01 11 02 12 03 13 - // a1: 20 30 21 31 22 32 23 33 - const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); - - // Unpack 32 bit elements resulting in: - // out[0]: 00 10 20 30 - // out[1]: 01 11 21 31 - // out[2]: 02 12 22 32 - // out[3]: 03 13 23 33 - out[0] = _mm_unpacklo_epi32(a0, a1); - out[1] = _mm_srli_si128(out[0], 8); - out[2] = _mm_unpackhi_epi32(a0, a1); - out[3] = _mm_srli_si128(out[2], 8); -} - -static INLINE void transpose_16bit_4x8(const __m128i *const in, - __m128i *const out) { - // Unpack 16 bit elements. 
Goes from: - // in[0]: 00 01 02 03 XX XX XX XX - // in[1]: 10 11 12 13 XX XX XX XX - // in[2]: 20 21 22 23 XX XX XX XX - // in[3]: 30 31 32 33 XX XX XX XX - // in[4]: 40 41 42 43 XX XX XX XX - // in[5]: 50 51 52 53 XX XX XX XX - // in[6]: 60 61 62 63 XX XX XX XX - // in[7]: 70 71 72 73 XX XX XX XX - // to: - // a0: 00 10 01 11 02 12 03 13 - // a1: 20 30 21 31 22 32 23 33 - // a2: 40 50 41 51 42 52 43 53 - // a3: 60 70 61 71 62 72 63 73 - const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); - - // Unpack 32 bit elements resulting in: - // b0: 00 10 20 30 01 11 21 31 - // b1: 40 50 60 70 41 51 61 71 - // b2: 02 12 22 32 03 13 23 33 - // b3: 42 52 62 72 43 53 63 73 - const __m128i b0 = _mm_unpacklo_epi32(a0, a1); - const __m128i b1 = _mm_unpacklo_epi32(a2, a3); - const __m128i b2 = _mm_unpackhi_epi32(a0, a1); - const __m128i b3 = _mm_unpackhi_epi32(a2, a3); - - // Unpack 64 bit elements resulting in: - // out[0]: 00 10 20 30 40 50 60 70 - // out[1]: 01 11 21 31 41 51 61 71 - // out[2]: 02 12 22 32 42 52 62 72 - // out[3]: 03 13 23 33 43 53 63 73 - out[0] = _mm_unpacklo_epi64(b0, b1); - out[1] = _mm_unpackhi_epi64(b0, b1); - out[2] = _mm_unpacklo_epi64(b2, b3); - out[3] = _mm_unpackhi_epi64(b2, b3); -} - -static INLINE void transpose_16bit_8x4(const __m128i *const in, - __m128i *const out) { - // Unpack 16 bit elements. 
Goes from: - // in[0]: 00 01 02 03 04 05 06 07 - // in[1]: 10 11 12 13 14 15 16 17 - // in[2]: 20 21 22 23 24 25 26 27 - // in[3]: 30 31 32 33 34 35 36 37 - - // to: - // a0: 00 10 01 11 02 12 03 13 - // a1: 20 30 21 31 22 32 23 33 - // a4: 04 14 05 15 06 16 07 17 - // a5: 24 34 25 35 26 36 27 37 - const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); - - // Unpack 32 bit elements resulting in: - // b0: 00 10 20 30 01 11 21 31 - // b2: 04 14 24 34 05 15 25 35 - // b4: 02 12 22 32 03 13 23 33 - // b6: 06 16 26 36 07 17 27 37 - const __m128i b0 = _mm_unpacklo_epi32(a0, a1); - const __m128i b2 = _mm_unpacklo_epi32(a4, a5); - const __m128i b4 = _mm_unpackhi_epi32(a0, a1); - const __m128i b6 = _mm_unpackhi_epi32(a4, a5); - - // Unpack 64 bit elements resulting in: - // out[0]: 00 10 20 30 XX XX XX XX - // out[1]: 01 11 21 31 XX XX XX XX - // out[2]: 02 12 22 32 XX XX XX XX - // out[3]: 03 13 23 33 XX XX XX XX - // out[4]: 04 14 24 34 XX XX XX XX - // out[5]: 05 15 25 35 XX XX XX XX - // out[6]: 06 16 26 36 XX XX XX XX - // out[7]: 07 17 27 37 XX XX XX XX - const __m128i zeros = _mm_setzero_si128(); - out[0] = _mm_unpacklo_epi64(b0, zeros); - out[1] = _mm_unpackhi_epi64(b0, zeros); - out[2] = _mm_unpacklo_epi64(b4, zeros); - out[3] = _mm_unpackhi_epi64(b4, zeros); - out[4] = _mm_unpacklo_epi64(b2, zeros); - out[5] = _mm_unpackhi_epi64(b2, zeros); - out[6] = _mm_unpacklo_epi64(b6, zeros); - out[7] = _mm_unpackhi_epi64(b6, zeros); -} - -static INLINE void transpose_16bit_8x8(const __m128i *const in, - __m128i *const out) { - // Unpack 16 bit elements. 
Goes from: - // in[0]: 00 01 02 03 04 05 06 07 - // in[1]: 10 11 12 13 14 15 16 17 - // in[2]: 20 21 22 23 24 25 26 27 - // in[3]: 30 31 32 33 34 35 36 37 - // in[4]: 40 41 42 43 44 45 46 47 - // in[5]: 50 51 52 53 54 55 56 57 - // in[6]: 60 61 62 63 64 65 66 67 - // in[7]: 70 71 72 73 74 75 76 77 - // to: - // a0: 00 10 01 11 02 12 03 13 - // a1: 20 30 21 31 22 32 23 33 - // a2: 40 50 41 51 42 52 43 53 - // a3: 60 70 61 71 62 72 63 73 - // a4: 04 14 05 15 06 16 07 17 - // a5: 24 34 25 35 26 36 27 37 - // a6: 44 54 45 55 46 56 47 57 - // a7: 64 74 65 75 66 76 67 77 - const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); - - // Unpack 32 bit elements resulting in: - // b0: 00 10 20 30 01 11 21 31 - // b1: 40 50 60 70 41 51 61 71 - // b2: 04 14 24 34 05 15 25 35 - // b3: 44 54 64 74 45 55 65 75 - // b4: 02 12 22 32 03 13 23 33 - // b5: 42 52 62 72 43 53 63 73 - // b6: 06 16 26 36 07 17 27 37 - // b7: 46 56 66 76 47 57 67 77 - const __m128i b0 = _mm_unpacklo_epi32(a0, a1); - const __m128i b1 = _mm_unpacklo_epi32(a2, a3); - const __m128i b2 = _mm_unpacklo_epi32(a4, a5); - const __m128i b3 = _mm_unpacklo_epi32(a6, a7); - const __m128i b4 = _mm_unpackhi_epi32(a0, a1); - const __m128i b5 = _mm_unpackhi_epi32(a2, a3); - const __m128i b6 = _mm_unpackhi_epi32(a4, a5); - const __m128i b7 = _mm_unpackhi_epi32(a6, a7); - - // Unpack 64 bit elements resulting in: - // out[0]: 00 10 20 30 40 50 60 70 - // out[1]: 01 11 21 31 41 51 61 71 - // out[2]: 02 12 22 32 42 52 62 72 - // out[3]: 03 13 23 33 43 53 63 73 - // out[4]: 04 14 24 34 44 54 64 74 - // out[5]: 05 15 25 35 45 55 65 75 - // out[6]: 06 16 26 
36 46 56 66 76 - // out[7]: 07 17 27 37 47 57 67 77 - out[0] = _mm_unpacklo_epi64(b0, b1); - out[1] = _mm_unpackhi_epi64(b0, b1); - out[2] = _mm_unpacklo_epi64(b4, b5); - out[3] = _mm_unpackhi_epi64(b4, b5); - out[4] = _mm_unpacklo_epi64(b2, b3); - out[5] = _mm_unpackhi_epi64(b2, b3); - out[6] = _mm_unpacklo_epi64(b6, b7); - out[7] = _mm_unpackhi_epi64(b6, b7); -} - -// Transpose in-place -static INLINE void transpose_16bit_16x16(__m128i *const left, - __m128i *const right) { - __m128i tbuf[8]; - transpose_16bit_8x8(left, left); - transpose_16bit_8x8(right, tbuf); - transpose_16bit_8x8(left + 8, right); - transpose_16bit_8x8(right + 8, right + 8); - - left[8] = tbuf[0]; - left[9] = tbuf[1]; - left[10] = tbuf[2]; - left[11] = tbuf[3]; - left[12] = tbuf[4]; - left[13] = tbuf[5]; - left[14] = tbuf[6]; - left[15] = tbuf[7]; -} - -static INLINE void transpose_32bit_4x4(const __m128i *const in, - __m128i *const out) { - // Unpack 32 bit elements. Goes from: - // in[0]: 00 01 02 03 - // in[1]: 10 11 12 13 - // in[2]: 20 21 22 23 - // in[3]: 30 31 32 33 - // to: - // a0: 00 10 01 11 - // a1: 20 30 21 31 - // a2: 02 12 03 13 - // a3: 22 32 23 33 - - const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); - const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); - const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); - - // Unpack 64 bit elements resulting in: - // out[0]: 00 10 20 30 - // out[1]: 01 11 21 31 - // out[2]: 02 12 22 32 - // out[3]: 03 13 23 33 - out[0] = _mm_unpacklo_epi64(a0, a1); - out[1] = _mm_unpackhi_epi64(a0, a1); - out[2] = _mm_unpacklo_epi64(a2, a3); - out[3] = _mm_unpackhi_epi64(a2, a3); -} - -static INLINE void transpose_32bit_4x4x2(const __m128i *const in, - __m128i *const out) { - // Unpack 32 bit elements. 
Goes from: - // in[0]: 00 01 02 03 - // in[1]: 10 11 12 13 - // in[2]: 20 21 22 23 - // in[3]: 30 31 32 33 - // in[4]: 04 05 06 07 - // in[5]: 14 15 16 17 - // in[6]: 24 25 26 27 - // in[7]: 34 35 36 37 - // to: - // a0: 00 10 01 11 - // a1: 20 30 21 31 - // a2: 02 12 03 13 - // a3: 22 32 23 33 - // a4: 04 14 05 15 - // a5: 24 34 25 35 - // a6: 06 16 07 17 - // a7: 26 36 27 37 - const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); - const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); - const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); - const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); - const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); - const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); - const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); - const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); - - // Unpack 64 bit elements resulting in: - // out[0]: 00 10 20 30 - // out[1]: 01 11 21 31 - // out[2]: 02 12 22 32 - // out[3]: 03 13 23 33 - // out[4]: 04 14 24 34 - // out[5]: 05 15 25 35 - // out[6]: 06 16 26 36 - // out[7]: 07 17 27 37 - out[0] = _mm_unpacklo_epi64(a0, a1); - out[1] = _mm_unpackhi_epi64(a0, a1); - out[2] = _mm_unpacklo_epi64(a2, a3); - out[3] = _mm_unpackhi_epi64(a2, a3); - out[4] = _mm_unpacklo_epi64(a4, a5); - out[5] = _mm_unpackhi_epi64(a4, a5); - out[6] = _mm_unpacklo_epi64(a6, a7); - out[7] = _mm_unpackhi_epi64(a6, a7); -} - -static INLINE void transpose_32bit_8x4(const __m128i *const in, - __m128i *const out) { - // Unpack 32 bit elements. 
Goes from: - // in[0]: 00 01 02 03 - // in[1]: 04 05 06 07 - // in[2]: 10 11 12 13 - // in[3]: 14 15 16 17 - // in[4]: 20 21 22 23 - // in[5]: 24 25 26 27 - // in[6]: 30 31 32 33 - // in[7]: 34 35 36 37 - // to: - // a0: 00 10 01 11 - // a1: 20 30 21 31 - // a2: 02 12 03 13 - // a3: 22 32 23 33 - // a4: 04 14 05 15 - // a5: 24 34 25 35 - // a6: 06 16 07 17 - // a7: 26 36 27 37 - const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); - const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); - const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); - const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); - const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); - const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); - const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); - const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); - - // Unpack 64 bit elements resulting in: - // out[0]: 00 10 20 30 - // out[1]: 01 11 21 31 - // out[2]: 02 12 22 32 - // out[3]: 03 13 23 33 - // out[4]: 04 14 24 34 - // out[5]: 05 15 25 35 - // out[6]: 06 16 26 36 - // out[7]: 07 17 27 37 - out[0] = _mm_unpacklo_epi64(a0, a1); - out[1] = _mm_unpackhi_epi64(a0, a1); - out[2] = _mm_unpacklo_epi64(a2, a3); - out[3] = _mm_unpackhi_epi64(a2, a3); - out[4] = _mm_unpacklo_epi64(a4, a5); - out[5] = _mm_unpackhi_epi64(a4, a5); - out[6] = _mm_unpacklo_epi64(a6, a7); - out[7] = _mm_unpackhi_epi64(a6, a7); -} - -#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h deleted file mode 100644 index b1611ba87..000000000 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ -#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ - -#include <emmintrin.h> -#include "aom/aom_integer.h" -#include "aom_dsp/x86/synonyms.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, - int8_t cos_bit); - -static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { - return _mm256_set1_epi32( - (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); -} - -static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, - __m256i *in0, __m256i *in1, const __m256i _r, - const int32_t cos_bit) { - __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); - __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); - __m256i u0 = _mm256_madd_epi16(t0, w0); - __m256i u1 = _mm256_madd_epi16(t1, w0); - __m256i v0 = _mm256_madd_epi16(t0, w1); - __m256i v1 = _mm256_madd_epi16(t1, w1); - - __m256i a0 = _mm256_add_epi32(u0, _r); - __m256i a1 = _mm256_add_epi32(u1, _r); - __m256i b0 = _mm256_add_epi32(v0, _r); - __m256i b1 = _mm256_add_epi32(v1, _r); - - __m256i c0 = _mm256_srai_epi32(a0, cos_bit); - __m256i c1 = _mm256_srai_epi32(a1, cos_bit); - __m256i d0 = _mm256_srai_epi32(b0, cos_bit); - __m256i d1 = _mm256_srai_epi32(b1, cos_bit); - - *in0 = _mm256_packs_epi32(c0, c1); - *in1 = _mm256_packs_epi32(d0, d1); -} - -static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { - const __m256i _in0 = *in0; - const __m256i _in1 = *in1; - *in0 = _mm256_adds_epi16(_in0, _in1); - *in1 = _mm256_subs_epi16(_in0, _in1); -} - -static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { - const __m256i _in0 = *in0; - const __m256i _in1 
= *in1; - *in0 = _mm256_add_epi32(_in0, _in1); - *in1 = _mm256_sub_epi32(_in0, _in1); -} - -static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, - __m256i in0, __m256i in1) { - const __m256i _in0 = in0; - const __m256i _in1 = in1; - *out0 = _mm256_adds_epi16(_in0, _in1); - *out1 = _mm256_subs_epi16(_in0, _in1); -} - -static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, - __m256i in0, __m256i in1) { - const __m256i _in0 = in0; - const __m256i _in1 = in1; - *out0 = _mm256_add_epi32(_in0, _in1); - *out1 = _mm256_sub_epi32(_in0, _in1); -} - -static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) { - return _mm256_load_si256((const __m256i *)a); -} - -static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, - int stride, __m256i *out, - int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = load_16bit_to_16bit_avx2(in + i * stride); - } -} - -static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, - int stride, - __m256i *out, - int out_size) { - for (int i = 0; i < out_size; ++i) { - out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride); - } -} - -static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { - const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); - const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); - return _mm256_permute4x64_epi64(b, 0xD8); -} - -static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, - int stride, __m256i *out, - int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); - } -} - -static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, - __m256i *const out) { - // Unpack 16 bit elements. 
Goes from: - // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f - // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f - // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f - // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f - // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f - // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f - // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f - // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f - // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f - // to: - // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - // ... - __m256i a[16]; - for (int i = 0; i < 16; i += 2) { - a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); - a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); - } - __m256i b[16]; - for (int i = 0; i < 16; i += 2) { - b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); - b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); - } - __m256i c[16]; - for (int i = 0; i < 16; i += 2) { - c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); - c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); - } - out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); - out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); - out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); - out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); - - out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); - out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); - out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); - out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); - - out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); - out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); - out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); 
- out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); - - out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); - out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); - out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); - out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); -} - -static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { - for (int i = 0; i < size; ++i) { - out[size - i - 1] = in[i]; - } -} - -static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { - if (bit < 0) { - bit = -bit; - __m256i round = _mm256_set1_epi16(1 << (bit - 1)); - for (int i = 0; i < size; ++i) { - in[i] = _mm256_adds_epi16(in[i], round); - in[i] = _mm256_srai_epi16(in[i], bit); - } - } else if (bit > 0) { - for (int i = 0; i < size; ++i) { - in[i] = _mm256_slli_epi16(in[i], bit); - } - } -} - -#ifdef __cplusplus -} -#endif - -#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h deleted file mode 100644 index ed82eee96..000000000 --- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ -#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ - -#include <emmintrin.h> -#include "aom/aom_integer.h" -#include "aom_dsp/x86/synonyms.h" - -#define pair_set_epi16(a, b) \ - _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) - -// Reverse the 8 16 bit words in __m128i -static INLINE __m128i mm_reverse_epi16(const __m128i x) { - const __m128i a = _mm_shufflelo_epi16(x, 0x1b); - const __m128i b = _mm_shufflehi_epi16(a, 0x1b); - return _mm_shuffle_epi32(b, 0x4e); -} - -#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c deleted file mode 100644 index 800aef126..000000000 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" - -static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) { - return _mm_add_epi16(_mm256_castsi256_si128(val), - _mm256_extractf128_si256(val, 1)); -} - -static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { - return _mm_add_epi32(_mm256_castsi256_si128(val), - _mm256_extractf128_si256(val, 1)); -} - -static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, - __m256i *const sse, - __m256i *const sum) { - const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1) - - // unpack into pairs of source and reference values - const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); - const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); - - // subtract adjacent elements using src*1 + ref*-1 - const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); - const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); - const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); - const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); - - // add to the running totals - *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); - *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); -} - -static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, - unsigned int *const sse) { - // extract the low lane and add it to the high lane - const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); - - // unpack sse and sum registers and add - const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); - const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); - const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); - - // perform the final summation and extract the results - const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); - *((int *)sse) = _mm_cvtsi128_si32(res); - return _mm_extract_epi32(res, 1); -} - -// handle pixels (<= 512) -static 
INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, - unsigned int *const sse) { - // extract the low lane and add it to the high lane - const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); - const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); - const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); - return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); -} - -// handle 1024 pixels (32x32, 16x64, 64x16) -static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, - unsigned int *const sse) { - // extract the low lane and add it to the high lane - const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); - const __m128i vsum_64 = - _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), - _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); - return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); -} - -static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { - const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); - const __m256i sum_hi = - _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); - return _mm256_add_epi32(sum_lo, sum_hi); -} - -// handle 2048 pixels (32x64, 64x32) -static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, - unsigned int *const sse) { - vsum = sum_to_32bit_avx2(vsum); - const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); - return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); -} - -static INLINE void variance16_kernel_avx2( - const uint8_t *const src, const int src_stride, const uint8_t *const ref, - const int ref_stride, __m256i *const sse, __m256i *const sum) { - const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); - const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); - const __m256i s = 
_mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); - const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); - variance_kernel_avx2(s, r, sse, sum); -} - -static INLINE void variance32_kernel_avx2(const uint8_t *const src, - const uint8_t *const ref, - __m256i *const sse, - __m256i *const sum) { - const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); - const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); - variance_kernel_avx2(s, r, sse, sum); -} - -static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m256i *const vsse, - __m256i *const vsum) { - *vsum = _mm256_setzero_si256(); - - for (int i = 0; i < h; i += 2) { - variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); - src += 2 * src_stride; - ref += 2 * ref_stride; - } -} - -static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m256i *const vsse, - __m256i *const vsum) { - *vsum = _mm256_setzero_si256(); - - for (int i = 0; i < h; i++) { - variance32_kernel_avx2(src, ref, vsse, vsum); - src += src_stride; - ref += ref_stride; - } -} - -static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m256i *const vsse, - __m256i *const vsum) { - *vsum = _mm256_setzero_si256(); - - for (int i = 0; i < h; i++) { - variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); - variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); - src += src_stride; - ref += ref_stride; - } -} - -static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m256i *const vsse, - __m256i *const vsum) { - *vsum = _mm256_setzero_si256(); - - for (int i = 0; i < h; i++) { - variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); - variance32_kernel_avx2(src + 32, ref 
+ 32, vsse, vsum); - variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); - variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); - src += src_stride; - ref += ref_stride; - } -} - -#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ - unsigned int aom_variance##bw##x##bh##_avx2( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - unsigned int *sse) { \ - __m256i vsse = _mm256_setzero_si256(); \ - __m256i vsum; \ - variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ - const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ - } - -AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512); -AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512); -AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512); -AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512); -AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024); - -AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512); -AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512); -AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024); -AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048); - -AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024); -AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048); - -#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ - unsigned int aom_variance##bw##x##bh##_avx2( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - unsigned int *sse) { \ - __m256i vsse = _mm256_setzero_si256(); \ - __m256i vsum = _mm256_setzero_si256(); \ - for (int i = 0; i < (bh / uh); i++) { \ - __m256i vsum16; \ - variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ - &vsum16); \ - vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ - src += uh * src_stride; \ - ref += uh * ref_stride; \ - } \ - const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ - const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ - return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ - } - -AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32) -AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32) 
-AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16) -AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16) - -unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse); - -unsigned int aom_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sseptr); - -#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \ - unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \ - src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ - &sse2); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ - } - -AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7); -AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6); -AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7); -AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6); -AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); -AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); -AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); 
-AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); - -#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ - unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ - const uint8_t *sec) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - const uint8_t *sec_ptr = sec; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \ - src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ - sec_ptr, w, hf, &sse2); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - sec_ptr += hf * w; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - sec += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ - } - -AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7); -AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6); -AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7); -AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6); -AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5); -AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6); -AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5); -AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4); - -static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { - const __m256i d = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); - return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); -} - -static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { - const __m256i d = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); - return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); -} - -static INLINE void 
comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, - const __m256i a, - uint8_t *comp_pred) { - const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); - const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; - const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); - - const __m256i ma = _mm256_sub_epi8(alpha_max, a); - - const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); - const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); - const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); - const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); - - const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); - const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); - const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); - const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); - - const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); - _mm256_storeu_si256((__m256i *)(comp_pred), roundA); -} - -void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride, - const uint8_t *mask, int mask_stride, - int invert_mask) { - int i = 0; - const uint8_t *src0 = invert_mask ? pred : ref; - const uint8_t *src1 = invert_mask ? ref : pred; - const int stride0 = invert_mask ? width : ref_stride; - const int stride1 = invert_mask ? 
ref_stride : width; - if (width == 8) { - comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, - mask, mask_stride); - } else if (width == 16) { - do { - const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); - const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); - const __m256i aA = mm256_loadu2(mask + mask_stride, mask); - src0 += (stride0 << 1); - src1 += (stride1 << 1); - mask += (mask_stride << 1); - const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); - const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); - const __m256i aB = mm256_loadu2(mask + mask_stride, mask); - src0 += (stride0 << 1); - src1 += (stride1 << 1); - mask += (mask_stride << 1); - // comp_pred's stride == width == 16 - comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); - comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); - comp_pred += (16 << 2); - i += 4; - } while (i < height); - } else { // for width == 32 - do { - const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0)); - const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1)); - const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask)); - - const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0)); - const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1)); - const __m256i aB = - _mm256_lddqu_si256((const __m256i *)(mask + mask_stride)); - - comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); - comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); - comp_pred += (32 << 1); - - src0 += (stride0 << 1); - src1 += (stride1 << 1); - mask += (mask_stride << 1); - i += 2; - } while (i < height); - } -} - -static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, - const __m256i s1, - const __m256i a) { - const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m256i round_const = - _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); - - const __m256i 
s_lo = _mm256_unpacklo_epi16(s0, s1); - const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); - const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); - const __m256i pred_l = _mm256_srai_epi32( - _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); - - const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); - const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); - const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); - const __m256i pred_h = _mm256_srai_epi32( - _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); - - const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); - - return comp; -} - -void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { - int i = 0; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - const uint16_t *src0 = invert_mask ? pred : ref; - const uint16_t *src1 = invert_mask ? ref : pred; - const int stride0 = invert_mask ? width : ref_stride; - const int stride1 = invert_mask ? 
ref_stride : width; - const __m256i zero = _mm256_setzero_si256(); - - if (width == 8) { - do { - const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); - const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); - - const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); - const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); - - __m256i m = _mm256_castsi128_si256(m_l); - m = _mm256_insertf128_si256(m, m_h, 1); - const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); - - const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); - - _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); - - _mm_storeu_si128((__m128i *)(comp_pred + width), - _mm256_extractf128_si256(comp, 1)); - - src0 += (stride0 << 1); - src1 += (stride1 << 1); - mask += (mask_stride << 1); - comp_pred += (width << 1); - i += 2; - } while (i < height); - } else if (width == 16) { - do { - const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); - const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); - const __m256i m_16 = - _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); - - const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); - - _mm256_storeu_si256((__m256i *)comp_pred, comp); - - src0 += stride0; - src1 += stride1; - mask += mask_stride; - comp_pred += width; - i += 1; - } while (i < height); - } else if (width == 32) { - do { - const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); - const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); - const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); - const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); - - const __m256i m01_16 = - _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); - const __m256i m23_16 = - _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); - - const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); - const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, 
s3, m23_16); - - _mm256_storeu_si256((__m256i *)comp_pred, comp); - _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); - - src0 += stride0; - src1 += stride1; - mask += mask_stride; - comp_pred += width; - i += 1; - } while (i < height); - } -} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c deleted file mode 100644 index 88e27aef3..000000000 --- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <immintrin.h> // AVX2 - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" - -/* clang-format off */ -DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, -}; -/* clang-format on */ - -#define FILTER_SRC(filter) \ - /* filter the source */ \ - exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ - exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ - exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); - -#define MERGE_WITH_SRC(src_reg, reg) \ - exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ - exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); - -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); - -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = 
_mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - /* average between current and next stride source */ \ - src_reg = _mm256_avg_epu8(src_reg, src_next_reg); - -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - MERGE_WITH_SRC(src_reg, src_next_reg) - -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ - exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ - exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); - -// final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ - 
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse) { - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another 
source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - // save current source average - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit 
to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src_pack = src_reg; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - _mm256_zeroupper(); - return sum; -} - -unsigned int aom_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { - __m256i sec_reg; - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, 
sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - 
CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = 
_mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - MERGE_WITH_SRC(src_reg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert 
each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - _mm256_zeroupper(); - return sum; -} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c deleted file mode 100644 index 66b0d7d84..000000000 --- a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
- */ - -#include <tmmintrin.h> - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/synonyms.h" - -void aom_var_filter_block2d_bil_first_pass_ssse3( - const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow - // in computation using _mm_maddubs_epi16. - // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. - const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; - const __m128i r = _mm_set1_epi16(round); - const uint8_t f0 = filter[0] >> 1; - const uint8_t f1 = filter[1] >> 1; - const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, - f0, f1, f0, f1, f0, f1); - unsigned int i, j; - (void)pixel_step; - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - // load source - __m128i source_low = xx_loadl_64(a); - __m128i source_hi = xx_loadl_64(a + 1); - - // unpack to: - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } - __m128i source = _mm_unpacklo_epi8(source_low, source_hi); - - // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] - __m128i res = _mm_maddubs_epi16(source, filters); - - // round - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storeu_128(b, res); - - a += 8; - b += 8; - } - - a += src_pixels_per_line - output_width; - } - } else { - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); - for (i = 0; i < output_height; ++i) { - // load source, only first 5 values are meaningful: - // { a[0], a[1], a[2], a[3], a[4], xxxx } - __m128i source = xx_loadl_64(a); - - // shuffle, up to the first 8 are useful - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } 
- __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - __m128i res = _mm_maddubs_epi16(source_shuffle, filters); - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storel_64(b, res); - - a += src_pixels_per_line; - b += output_width; - } - } -} - -void aom_var_filter_block2d_bil_second_pass_ssse3( - const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, - unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - const int16_t round = (1 << FILTER_BITS) >> 1; - const __m128i r = _mm_set1_epi32(round); - const __m128i filters = - _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], - filter[1], filter[0], filter[1]); - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - const __m128i mask = - _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - unsigned int i, j; - - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - // load source as: - // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } - __m128i source1 = xx_loadl_64(a); - __m128i source2 = xx_loadl_64(a + pixel_step); - __m128i source = _mm_unpacklo_epi64(source1, source2); - - // shuffle source to: - // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - // b[i] = a[i] * filter[0] + a[w + i] * filter[1] - __m128i res = _mm_madd_epi16(source_shuffle, filters); - - // round - res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); - - // shuffle to get each lower 8 bit of every 32 bit - res = _mm_shuffle_epi8(res, mask); - - xx_storel_32(b, res); - - a += 4; - b += 4; - } - - a += src_pixels_per_line - output_width; - } -} diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c deleted file mode 100644 index 3c37e77c0..000000000 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c 
+++ /dev/null @@ -1,806 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" -#include "config/av1_rtcd.h" - -#include "aom_dsp/blend.h" -#include "aom_dsp/x86/synonyms.h" - -#include "aom_ports/mem.h" - -#include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" -#include "av1/common/reconinter.h" - -unsigned int aom_get_mb_ss_sse2(const int16_t *src) { - __m128i vsum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 32; ++i) { - const __m128i v = xx_loadu_128(src); - vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; - } - - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); -} - -static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { - const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride)); - const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride)); - return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); -} - -static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) { - const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); - return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); -} - -// Accumulate 4 32bit numbers in val to 1 32bit number -static INLINE unsigned int add32x4_sse2(__m128i val) { - val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); - val = 
_mm_add_epi32(val, _mm_srli_si128(val, 4)); - return _mm_cvtsi128_si32(val); -} - -// Accumulate 8 16bit in sum to 4 32bit number -static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { - const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); - const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); - return _mm_add_epi32(sum_lo, sum_hi); -} - -static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, - __m128i *const sse, - __m128i *const sum) { - const __m128i diff = _mm_sub_epi16(src, ref); - *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); - *sum = _mm_add_epi16(*sum, diff); -} - -// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) -// Slightly faster than variance_final_256_pel_sse2() -// diff sum of 128 pixels can still fit in 16bit integer -static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, - unsigned int *const sse, - int *const sum) { - *sse = add32x4_sse2(vsse); - - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); -} - -// Can handle 256 pixels' diff sum (such as 16x16) -static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, - unsigned int *const sse, - int *const sum) { - *sse = add32x4_sse2(vsse); - - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); - *sum += (int16_t)_mm_extract_epi16(vsum, 1); -} - -// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) -static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, - unsigned int *const sse, - int *const sum) { - *sse = add32x4_sse2(vsse); - - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_unpacklo_epi16(vsum, vsum); - vsum = _mm_srai_epi32(vsum, 16); - *sum = add32x4_sse2(vsum); -} - -// 
Can handle 1024 pixels' diff sum (such as 32x32) -static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, - unsigned int *const sse, - int *const sum) { - *sse = add32x4_sse2(vsse); - - vsum = sum_to_32bit_sse2(vsum); - *sum = add32x4_sse2(vsum); -} - -static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m128i *const sse, - __m128i *const sum) { - assert(h <= 256); // May overflow for larger height. - *sum = _mm_setzero_si128(); - - for (int i = 0; i < h; i += 2) { - const __m128i s = load4x2_sse2(src, src_stride); - const __m128i r = load4x2_sse2(ref, ref_stride); - - variance_kernel_sse2(s, r, sse, sum); - src += 2 * src_stride; - ref += 2 * ref_stride; - } -} - -static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m128i *const sse, - __m128i *const sum) { - assert(h <= 128); // May overflow for larger height. 
- *sum = _mm_setzero_si128(); - for (int i = 0; i < h; i++) { - const __m128i s = load8_8to16_sse2(src); - const __m128i r = load8_8to16_sse2(ref); - - variance_kernel_sse2(s, r, sse, sum); - src += src_stride; - ref += ref_stride; - } -} - -static INLINE void variance16_kernel_sse2(const uint8_t *const src, - const uint8_t *const ref, - __m128i *const sse, - __m128i *const sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - - variance_kernel_sse2(src0, ref0, sse, sum); - variance_kernel_sse2(src1, ref1, sse, sum); -} - -static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m128i *const sse, - __m128i *const sum) { - assert(h <= 64); // May overflow for larger height. - *sum = _mm_setzero_si128(); - - for (int i = 0; i < h; ++i) { - variance16_kernel_sse2(src, ref, sse, sum); - src += src_stride; - ref += ref_stride; - } -} - -static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m128i *const sse, - __m128i *const sum) { - assert(h <= 32); // May overflow for larger height. - // Don't initialize sse here since it's an accumulation. - *sum = _mm_setzero_si128(); - - for (int i = 0; i < h; ++i) { - variance16_kernel_sse2(src + 0, ref + 0, sse, sum); - variance16_kernel_sse2(src + 16, ref + 16, sse, sum); - src += src_stride; - ref += ref_stride; - } -} - -static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m128i *const sse, - __m128i *const sum) { - assert(h <= 16); // May overflow for larger height. 
- *sum = _mm_setzero_si128(); - - for (int i = 0; i < h; ++i) { - variance16_kernel_sse2(src + 0, ref + 0, sse, sum); - variance16_kernel_sse2(src + 16, ref + 16, sse, sum); - variance16_kernel_sse2(src + 32, ref + 32, sse, sum); - variance16_kernel_sse2(src + 48, ref + 48, sse, sum); - src += src_stride; - ref += ref_stride; - } -} - -static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, - const int h, __m128i *const sse, - __m128i *const sum) { - assert(h <= 8); // May overflow for larger height. - *sum = _mm_setzero_si128(); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < 4; ++j) { - const int offset0 = j << 5; - const int offset1 = offset0 + 16; - variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); - variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); - } - src += src_stride; - ref += ref_stride; - } -} - -#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ - unsigned int aom_variance##bw##x##bh##_sse2( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - unsigned int *sse) { \ - __m128i vsse = _mm_setzero_si128(); \ - __m128i vsum; \ - int sum = 0; \ - variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ - variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ - assert(sum <= 255 * bw * bh); \ - assert(sum >= -255 * bw * bh); \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ - } - -AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128); -AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128); -AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128); - -AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128); -AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128); -AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128); -AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256); - -AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128); -AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128); -AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256); -AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512); -AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024); - -AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256); 
-AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512); -AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024); - -#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ - unsigned int aom_variance##bw##x##bh##_sse2( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - unsigned int *sse) { \ - __m128i vsse = _mm_setzero_si128(); \ - __m128i vsum = _mm_setzero_si128(); \ - for (int i = 0; i < (bh / uh); ++i) { \ - __m128i vsum16; \ - variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ - &vsum16); \ - vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ - src += (src_stride * uh); \ - ref += (ref_stride * uh); \ - } \ - *sse = add32x4_sse2(vsse); \ - int sum = add32x4_sse2(vsum); \ - assert(sum <= 255 * bw * bh); \ - assert(sum >= -255 * bw * bh); \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ - } - -AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 ) - -AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024); -AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 ) -AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 ) -AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 ) - -AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 ) -AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 ) - -unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - 
aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -// The 2 unused parameters are place holders for PIC enabled build. -// These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int aom_sub_pixel_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ - void *unused0, void *unused) -#define DECLS(opt) \ - DECL(4, opt); \ - DECL(8, opt); \ - DECL(16, opt) - -DECLS(sse2); -DECLS(ssse3); -#undef DECLS -#undef DECL - -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ - &sse2, NULL, NULL); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ - } - -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); 
\ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ - FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ - FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) - -FNS(sse2); -FNS(ssse3); - -#undef FNS -#undef FN - -// The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ - int aom_sub_pixel_avg_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused) -#define DECLS(opt) \ - DECL(4, opt); \ - DECL(8, opt); \ - DECL(16, opt) - -DECLS(sse2); -DECLS(ssse3); -#undef DECL -#undef DECLS - -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ - const uint8_t *sec) { \ - /*Avoid overflow in helper by capping height.*/ \ - const int hf = AOMMIN(h, 64); \ - unsigned int sse = 0; \ - int se = 0; \ - for (int i = 0; i < (w / wf); ++i) { \ - const uint8_t *src_ptr = src; \ - const uint8_t *dst_ptr = dst; \ - const uint8_t *sec_ptr = sec; \ - for (int j = 0; j < (h / hf); ++j) { \ - unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src_ptr, src_stride, x_offset, y_offset, 
dst_ptr, dst_stride, \ - sec_ptr, w, hf, &sse2, NULL, NULL); \ - dst_ptr += hf * dst_stride; \ - src_ptr += hf * src_stride; \ - sec_ptr += hf * w; \ - se += se2; \ - sse += sse2; \ - } \ - src += wf; \ - dst += wf; \ - sec += wf; \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ - } - -#define FNS(opt) \ - FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ - FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ - FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ - FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ - FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) - -FNS(sse2); -FNS(ssse3); - -#undef FNS -#undef FN - -void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, - int mi_row, int mi_col, const MV *const mv, - uint8_t *comp_pred, int width, int height, - int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref, int ref_stride, - int subpel_search) { - // expect xd == NULL only in tests - if (xd != NULL) { - const MB_MODE_INFO *mi = xd->mi[0]; - const int ref_num = 0; - const int is_intrabc = 
is_intrabc_block(mi); - const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; - const int is_scaled = av1_is_scaled(sf); - - if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); - const struct buf_2d *const dst_buf = &pd->dst; - const struct buf_2d *const pre_buf = - is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - - // Calculate subpel_x/y and x/y_step. - const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. - const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. 
- const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = - av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. - const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); - - return; - } - } - - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - int filter_taps = (subpel_search == 1) ? 
4 : SUBPEL_TAPS; - - if (!subpel_x_q3 && !subpel_y_q3) { - if (width >= 16) { - int i; - assert(!(width & 15)); - /*Read 16 pixels one row at a time.*/ - for (i = 0; i < height; i++) { - int j; - for (j = 0; j < width; j += 16) { - xx_storeu_128(comp_pred, xx_loadu_128(ref)); - comp_pred += 16; - ref += 16; - } - ref += ref_stride - width; - } - } else if (width >= 8) { - int i; - assert(!(width & 7)); - assert(!(height & 1)); - /*Read 8 pixels two rows at a time.*/ - for (i = 0; i < height; i += 2) { - __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); - __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); - xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); - comp_pred += 16; - ref += 2 * ref_stride; - } - } else { - int i; - assert(!(width & 3)); - assert(!(height & 3)); - /*Read 4 pixels four rows at a time.*/ - for (i = 0; i < height; i++) { - const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride); - const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride); - const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride); - const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride); - const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), - _mm_unpacklo_epi32(row2, row3)); - xx_storeu_128(comp_pred, reg); - comp_pred += 16; - ref += 4 * ref_stride; - } - } - } else if (!subpel_y_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, - width, height); - } else if (!subpel_x_q3) { - const int16_t *const kernel = - av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, - width, height); - } else { - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - const int16_t *const kernel_x = - av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - const int16_t *const kernel_y = - av1_get_interp_filter_subpel_kernel(filter, 
subpel_y_q3 << 1); - const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); - uint8_t *temp_start_horiz = - (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; - uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); - int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - // TODO(Deepa): Remove the memset below when we have - // 4 tap simd for sse2 and ssse3. - if (subpel_search == 1) { - memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width); - memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width); - memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width); - memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width); - } - aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, - kernel_x, 16, NULL, -1, width, intermediate_height); - aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, - kernel_y, 16, width, height); - } -} - -void aom_comp_avg_upsampled_pred_sse2( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, int subpel_search) { - int n; - int i; - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); - /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ - assert(!(width * height & 15)); - n = width * height >> 4; - for (i = 0; i < n; i++) { - __m128i s0 = xx_loadu_128(comp_pred); - __m128i p0 = xx_loadu_128(pred); - xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); - comp_pred += 16; - pred += 16; - } -} - -void aom_comp_mask_upsampled_pred_sse2( - MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int 
width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int subpel_search) { - if (subpel_x_q3 | subpel_y_q3) { - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride, - subpel_search); - ref = comp_pred; - ref_stride = width; - } - aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, - mask_stride, invert_mask); -} - -static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, - const __m128i s1, - const __m128i a) { - const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i round_const = - _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); - const __m128i a_inv = _mm_sub_epi16(alpha_max, a); - - const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); - const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); - const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); - const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); - const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); - const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); - const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), - AOM_BLEND_A64_ROUND_BITS); - - const __m128i comp = _mm_packs_epi32(pred_l, pred_h); - - return comp; -} - -void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { - int i = 0; - uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - const uint16_t *src0 = invert_mask ? pred : ref; - const uint16_t *src1 = invert_mask ? ref : pred; - const int stride0 = invert_mask ? width : ref_stride; - const int stride1 = invert_mask ? 
ref_stride : width; - const __m128i zero = _mm_setzero_si128(); - - if (width == 8) { - do { - const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); - const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); - const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask); - const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero); - - const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16); - - _mm_storeu_si128((__m128i *)comp_pred, comp); - - src0 += stride0; - src1 += stride1; - mask += mask_stride; - comp_pred += width; - i += 1; - } while (i < height); - } else if (width == 16) { - do { - const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); - const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8)); - const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); - const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8)); - - const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask); - const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); - const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); - - const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); - const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); - - _mm_storeu_si128((__m128i *)comp_pred, comp); - _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1); - - src0 += stride0; - src1 += stride1; - mask += mask_stride; - comp_pred += width; - i += 1; - } while (i < height); - } else if (width == 32) { - do { - for (int j = 0; j < 2; j++) { - const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16)); - const __m128i s2 = - _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16)); - const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16)); - const __m128i s3 = - _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16)); - - const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16)); - const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); - const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); - - const __m128i comp = 
highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); - const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); - - _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp); - _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1); - } - src0 += stride0; - src1 += stride1; - mask += mask_stride; - comp_pred += width; - i += 1; - } while (i < height); - } -} |