diff options
author | Matt A. Tobin <email@mattatobin.com> | 2020-04-07 23:30:51 -0400 |
---|---|---|
committer | wolfbeast <mcwerewolf@wolfbeast.com> | 2020-04-14 13:26:42 +0200 |
commit | 277f2116b6660e9bbe7f5d67524be57eceb49b8b (patch) | |
tree | 4595f7cc71418f71b9a97dfaeb03a30aa60f336a /third_party/aom/av1/common/x86 | |
parent | d270404436f6e84ffa3b92af537ac721bf10d66e (diff) | |
download | UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar.gz UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar.lz UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.tar.xz UXP-277f2116b6660e9bbe7f5d67524be57eceb49b8b.zip |
Move aom source to a sub-directory under media/libaom
There is no damned reason to treat this differently than any other media lib given its license and there never was.
Diffstat (limited to 'third_party/aom/av1/common/x86')
43 files changed, 0 insertions, 24693 deletions
diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c deleted file mode 100644 index 8aa14696f..000000000 --- a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <smmintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/convolve.h" -#include "av1/common/resize.h" -#include "aom_dsp/x86/synonyms.h" - -// Note: If the crop width is not a multiple of 4, then, unlike the C version, -// this function will overwrite some of the padding on the right hand side of -// the frame. This padding appears to be trashed anyway, so this should not -// affect the running of the decoder. -void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const int16_t *x_filters, int x0_qn, - int x_step_qn) { - assert(UPSCALE_NORMATIVE_TAPS == 8); - - src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; - - const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); - const __m128i zero = _mm_setzero_si128(); - - const uint8_t *src_y; - uint8_t *dst_y; - int x_qn = x0_qn; - for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { - const int x_filter_idx0 = - ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - const int x_filter_idx1 = - ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - const int x_filter_idx2 = - ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - const int x_filter_idx3 = - ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - - assert(x_filter_idx0 <= RS_SUBPEL_MASK); - assert(x_filter_idx1 <= RS_SUBPEL_MASK); - assert(x_filter_idx2 <= RS_SUBPEL_MASK); - assert(x_filter_idx3 <= RS_SUBPEL_MASK); - - const int16_t *const x_filter0 = - &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; - const int16_t *const x_filter1 = - &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; - const int16_t *const x_filter2 = - &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; - const int16_t *const x_filter3 = - &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; - - const __m128i fil0_16 = xx_loadu_128(x_filter0); - const __m128i fil1_16 = xx_loadu_128(x_filter1); - const __m128i fil2_16 = xx_loadu_128(x_filter2); - const __m128i fil3_16 = xx_loadu_128(x_filter3); - - src_y = src; - dst_y = dst; - for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { - const uint8_t *const src_x0 = - &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - const uint8_t *const src_x1 = - &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - const uint8_t *const src_x2 = - &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - const uint8_t *const src_x3 = - &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - - // Load up the source data. This is 8-bit input data, so each load - // gets 8 pixels. - const __m128i src0_8 = xx_loadl_64(src_x0); - const __m128i src1_8 = xx_loadl_64(src_x1); - const __m128i src2_8 = xx_loadl_64(src_x2); - const __m128i src3_8 = xx_loadl_64(src_x3); - - // Now zero-extend up to 16-bit precision, i.e. - // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ] - const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8); - const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8); - const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8); - const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8); - - // Multiply by filter coefficients (results in a 32-bit value), - // and add adjacent pairs, i.e. - // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) - // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] - const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); - const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); - const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); - const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); - - // Reduce horizontally and add, i.e. - // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] - const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); - const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); - - const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); - - // Divide down by (1 << FILTER_BITS), rounding to nearest. - const __m128i shifted_32 = - _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); - - // Pack 32-bit values into 16-bit values, i.e. - // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] - const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); - - // Pack 16-bit values into 8-bit values, i.e. - // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ]) - // -> [ 0 0 0 0 0 0 DC BA ] - const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero); - - // Write to the output - xx_storel_32(&dst_y[x], shifted_8); - } - } -} - -// Note: If the crop width is not a multiple of 4, then, unlike the C version, -// this function will overwrite some of the padding on the right hand side of -// the frame. This padding appears to be trashed anyway, so this should not -// affect the running of the decoder. -void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, - int h, const int16_t *x_filters, - int x0_qn, int x_step_qn, int bd) { - assert(UPSCALE_NORMATIVE_TAPS == 8); - assert(bd == 8 || bd == 10 || bd == 12); - - src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; - - const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); - const __m128i zero = _mm_setzero_si128(); - const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1); - - const uint16_t *src_y; - uint16_t *dst_y; - int x_qn = x0_qn; - for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { - const int x_filter_idx0 = - ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - const int x_filter_idx1 = - ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - const int x_filter_idx2 = - ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - const int x_filter_idx3 = - ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; - - assert(x_filter_idx0 <= RS_SUBPEL_MASK); - assert(x_filter_idx1 <= RS_SUBPEL_MASK); - assert(x_filter_idx2 <= RS_SUBPEL_MASK); - assert(x_filter_idx3 <= RS_SUBPEL_MASK); - - const int16_t *const x_filter0 = - &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; - const int16_t *const x_filter1 = - &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; - const int16_t *const x_filter2 = - &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; - const int16_t *const x_filter3 = - &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; - - const __m128i fil0_16 = xx_loadu_128(x_filter0); - const __m128i fil1_16 = xx_loadu_128(x_filter1); - const __m128i fil2_16 = xx_loadu_128(x_filter2); - const __m128i fil3_16 = xx_loadu_128(x_filter3); - - src_y = src; - dst_y = dst; - for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { - const uint16_t *const src_x0 = - &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - const uint16_t *const src_x1 = - &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - const uint16_t *const src_x2 = - &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - const uint16_t *const src_x3 = - &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; - - // Load up the source data. This is 16-bit input data, so each load - // gets 8 pixels. - const __m128i src0_16 = xx_loadu_128(src_x0); - const __m128i src1_16 = xx_loadu_128(src_x1); - const __m128i src2_16 = xx_loadu_128(src_x2); - const __m128i src3_16 = xx_loadu_128(src_x3); - - // Multiply by filter coefficients (results in a 32-bit value), - // and add adjacent pairs, i.e. - // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) - // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] - const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); - const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); - const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); - const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); - - // Reduce horizontally and add, i.e. - // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] - const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); - const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); - - const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); - - // Divide down by (1 << FILTER_BITS), rounding to nearest. - const __m128i shifted_32 = - _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); - - // Pack 32-bit values into 16-bit values, i.e. - // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] - const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); - - // Clip the values at (1 << bd) - 1 - const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum); - - // Write to the output - xx_storel_64(&dst_y[x], clipped_16); - } - } -} diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c deleted file mode 100644 index d9fb53785..000000000 --- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c +++ /dev/null @@ -1,499 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <smmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "av1/common/convolve.h" - -// A specialised version of hfilter, the horizontal filter for -// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. -static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, - int h, int subpel_x_qn, int x_step_qn, - const InterpFilterParams *filter_params, unsigned round) { - const int bd = 8; - const int ntaps = 8; - - src -= ntaps / 2 - 1; - - int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); - const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = _mm_cvtsi32_si128(round); - - int x_qn = subpel_x_qn; - for (int x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); - const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(filter_idx < SUBPEL_SHIFTS); - const int16_t *filter = - av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); - - // Load the filter coefficients - const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); - const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); - - int y; - for (y = 0; y <= h - 4; y += 4) { - const uint8_t *const src0 = src_col + y * src_stride; - const uint8_t *const src1 = src0 + 1 * src_stride; - const uint8_t *const src2 = src0 + 2 * src_stride; - const uint8_t *const src3 = src0 + 3 * src_stride; - - // Load up source data. This is 8-bit input data; each load is just - // loading the lower half of the register and gets 8 pixels - const __m128i data08 = _mm_loadl_epi64((__m128i *)src0); - const __m128i data18 = _mm_loadl_epi64((__m128i *)src1); - const __m128i data28 = _mm_loadl_epi64((__m128i *)src2); - const __m128i data38 = _mm_loadl_epi64((__m128i *)src3); - - // Now zero-extend up to 16-bit precision by interleaving with - // zeros. Drop the upper half of each register (which just had zeros) - const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); - const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); - const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); - const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); - - // Multiply by coefficients - const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); - const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); - const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); - const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); - - // Reduce horizontally and add - const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); - const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); - const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); - - // Divide down by (1 << round), rounding to nearest. - __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); - - shifted = _mm_packus_epi32(shifted, shifted); - // Write transposed to the output - _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); - } - for (; y < h; ++y) { - const uint8_t *const src_row = src_col + y * src_stride; - - int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (int k = 0; k < ntaps; ++k) { - sum += filter[k] * src_row[k]; - } - - dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); - } - } -} - -static __m128i convolve_16_8(const int16_t *src, __m128i coeff) { - __m128i data = _mm_loadu_si128((__m128i *)src); - return _mm_madd_epi16(data, coeff); -} - -// A specialised version of vfilter, the vertical filter for -// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. -static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, int subpel_y_qn, - int y_step_qn, const InterpFilterParams *filter_params, - const ConvolveParams *conv_params, int bd) { - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int ntaps = 8; - - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - const __m128i sub = _mm_set1_epi16(sub32); - - CONV_BUF_TYPE *dst16 = conv_params->dst; - const int dst16_stride = conv_params->dst_stride; - const int bits = - FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; - const __m128i bits_shift = _mm_cvtsi32_si128(bits); - const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1)); - const __m128i round_shift_add = - _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); - const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); - - int y_qn = subpel_y_qn; - for (int y = 0; y < h; ++y, y_qn += y_step_qn) { - const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); - const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(filter_idx < SUBPEL_SHIFTS); - const int16_t *filter = - av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); - - const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); - int x; - for (x = 0; x <= w - 4; x += 4) { - const int16_t *const src0 = src_y + x * src_stride; - const int16_t *const src1 = src0 + 1 * src_stride; - const int16_t *const src2 = src0 + 2 * src_stride; - const int16_t *const src3 = src0 + 3 * src_stride; - - // Load the source data for the three rows, adding the three registers of - // convolved products to one as we go (conv0..conv3) to avoid the - // register pressure getting too high. - const __m128i conv0 = convolve_16_8(src0, coeff0716); - const __m128i conv1 = convolve_16_8(src1, coeff0716); - const __m128i conv2 = convolve_16_8(src2, coeff0716); - const __m128i conv3 = convolve_16_8(src3, coeff0716); - - // Now reduce horizontally to get one lane for each result - const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); - const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); - __m128i conv = _mm_hadd_epi32(conv01, conv23); - - conv = _mm_add_epi32(conv, res_add_const); - // Divide down by (1 << round_1), rounding to nearest and subtract sub32. - __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); - - uint8_t *dst_x = dst + y * dst_stride + x; - CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; - __m128i result; - __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); - - if (conv_params->is_compound) { - if (conv_params->do_average) { - const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); - if (conv_params->use_jnt_comp_avg) { - const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); - const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); - const __m128i shifted_32 = - _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); - shifted_16 = _mm_packus_epi32(shifted_32, shifted_32); - } else { - shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1); - } - const __m128i subbed = _mm_sub_epi16(shifted_16, sub); - result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); - const __m128i result_8 = _mm_packus_epi16(result, result); - *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); - } else { - _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); - } - } else { - const __m128i subbed = _mm_sub_epi16(shifted_16, sub); - result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); - const __m128i result_8 = _mm_packus_epi16(result, result); - *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); - } - } - for (; x < w; ++x) { - const int16_t *src_x = src_y + x * src_stride; - int32_t sum = 1 << offset_bits; - for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - - if (conv_params->is_compound) { - if (conv_params->do_average) { - int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { - tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; - tmp = tmp >> DIST_PRECISION_BITS; - } else { - tmp += res; - tmp = tmp >> 1; - } - /* Subtract round offset and convolve round */ - tmp = tmp - sub32; - dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); - } else { - dst16[y * dst16_stride + x] = res; - } - } else { - /* Subtract round offset and convolve round */ - int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); - } - } - } -} -void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, - uint8_t *dst8, int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int x_step_qn, - const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params) { - // TODO(yaowu): remove unnecessary initializations - int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 }; - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_params_y->taps; - - const int xtaps = filter_params_x->taps; - const int ytaps = filter_params_y->taps; - const int fo_vert = ytaps / 2 - 1; - assert((xtaps == 8) && (ytaps == 8)); - (void)xtaps; - - // horizontal filter - hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, - x_step_qn, filter_params_x, conv_params->round_0); - - // vertical filter (input is transposed) - vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn, - filter_params_y, conv_params, 8); -} - -// A specialised version of hfilter, the horizontal filter for -// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap -// filters. -static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst, - int w, int h, int subpel_x_qn, int x_step_qn, - const InterpFilterParams *filter_params, - unsigned round, int bd) { - const int ntaps = 8; - - src -= ntaps / 2 - 1; - - int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); - const __m128i round_add = _mm_set1_epi32(round_add32); - const __m128i round_shift = _mm_cvtsi32_si128(round); - - int x_qn = subpel_x_qn; - for (int x = 0; x < w; ++x, x_qn += x_step_qn) { - const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); - const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(filter_idx < SUBPEL_SHIFTS); - const int16_t *filter = - av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); - - // Load the filter coefficients - const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); - - int y; - for (y = 0; y <= h - 4; y += 4) { - const uint16_t *const src0 = src_col + y * src_stride; - const uint16_t *const src1 = src0 + 1 * src_stride; - const uint16_t *const src2 = src0 + 2 * src_stride; - const uint16_t *const src3 = src0 + 3 * src_stride; - - // Load up source data. This is 16-bit input data, so each load gets the 8 - // pixels we need. - const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); - const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); - const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); - const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); - - // Multiply by coefficients - const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); - const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); - const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); - const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); - - // Reduce horizontally and add - const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); - const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); - const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); - - // Divide down by (1 << round), rounding to nearest. - __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); - - shifted = _mm_packus_epi32(shifted, shifted); - // Write transposed to the output - _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); - } - for (; y < h; ++y) { - const uint16_t *const src_row = src_col + y * src_stride; - - int32_t sum = (1 << (bd + FILTER_BITS - 1)); - for (int k = 0; k < ntaps; ++k) { - sum += filter[k] * src_row[k]; - } - - dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); - } - } -} -// A specialised version of vfilter, the vertical filter for -// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap -// filters. -static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, - int dst_stride, int w, int h, int subpel_y_qn, - int y_step_qn, - const InterpFilterParams *filter_params, - const ConvolveParams *conv_params, int bd) { - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int ntaps = 8; - - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - const __m128i sub = _mm_set1_epi32(sub32); - - CONV_BUF_TYPE *dst16 = conv_params->dst; - const int dst16_stride = conv_params->dst_stride; - const __m128i clip_pixel_ = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - const int bits = - FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; - const __m128i bits_shift = _mm_cvtsi32_si128(bits); - const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1)); - const __m128i round_shift_add = - _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); - const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); - __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi32(w0); - const __m128i wt1 = _mm_set1_epi32(w1); - - int y_qn = subpel_y_qn; - for (int y = 0; y < h; ++y, y_qn += y_step_qn) { - const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); - const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(filter_idx < SUBPEL_SHIFTS); - const int16_t *filter = - av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); - - const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); - int x; - for (x = 0; x <= w - 4; x += 4) { - const int16_t *const src0 = src_y + x * src_stride; - const int16_t *const src1 = src0 + 1 * src_stride; - const int16_t *const src2 = src0 + 2 * src_stride; - const int16_t *const src3 = src0 + 3 * src_stride; - - // Load the source data for the three rows, adding the three registers of - // convolved products to one as we go (conv0..conv3) to avoid the - // register pressure getting too high. - const __m128i conv0 = convolve_16_8(src0, coeff0716); - const __m128i conv1 = convolve_16_8(src1, coeff0716); - const __m128i conv2 = convolve_16_8(src2, coeff0716); - const __m128i conv3 = convolve_16_8(src3, coeff0716); - - // Now reduce horizontally to get one lane for each result - const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); - const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); - __m128i conv = _mm_hadd_epi32(conv01, conv23); - conv = _mm_add_epi32(conv, res_add_const); - - // Divide down by (1 << round_1), rounding to nearest and subtract sub32. - __m128i shifted = - _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); - - uint16_t *dst_x = dst + y * dst_stride + x; - CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; - - __m128i result; - if (conv_params->is_compound) { - if (conv_params->do_average) { - __m128i p_32 = - _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); - - if (conv_params->use_jnt_comp_avg) { - shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), - _mm_mullo_epi32(shifted, wt1)); - shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); - } else { - shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1); - } - __m128i res32 = _mm_sub_epi32(shifted, sub); - res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const), - round_bits_shift); - - __m128i res16 = _mm_packus_epi32(res32, res32); - res16 = _mm_min_epi16(res16, clip_pixel_); - _mm_storel_epi64((__m128i *)dst_x, res16); - } else { - __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); - _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); - } - } else { - const __m128i subbed = _mm_sub_epi32(shifted, sub); - result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift); - result = _mm_packus_epi32(result, result); - result = _mm_min_epi16(result, clip_pixel_); - _mm_storel_epi64((__m128i *)dst_x, result); - } - } - - for (; x < w; ++x) { - const int16_t *src_x = src_y + x * src_stride; - int32_t sum = 1 << offset_bits; - for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; - CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); - if (conv_params->is_compound) { - if (conv_params->do_average) { - int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { - tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; - tmp = tmp >> DIST_PRECISION_BITS; - } else { - tmp += res; - tmp = tmp >> 1; - } - /* Subtract round offset and convolve round */ - tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - dst[y * dst_stride + x] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); - } else { - dst16[y * dst16_stride + x] = res; - } - } else { - /* Subtract round offset and convolve round */ - int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1))); - dst[y * dst_stride + x] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); - } - } - } -} - -void av1_highbd_convolve_2d_scale_sse4_1( - const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_qn, - const int x_step_qn, const int subpel_y_qn, const int y_step_qn, - ConvolveParams *conv_params, int bd) { - // TODO(yaowu): Move this out of stack - DECLARE_ALIGNED(16, int16_t, - tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); - int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + - filter_params_y->taps; - const int xtaps = filter_params_x->taps; - const int ytaps = filter_params_y->taps; - const int fo_vert = ytaps / 2 - 1; - - memset(tmp, 0, sizeof(tmp)); - assert((xtaps == 8) && (ytaps == 8)); - (void)xtaps; - - // horizontal filter - highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, - subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0, - bd); - - // vertical filter (input is transposed) - highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, - filter_params_y, conv_params, bd); -} diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c deleted file mode 100644 index 212d3bd72..000000000 --- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <smmintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/filter.h" - -typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src, - int src_stride, uint16_t *dst, int dst_stride, - int bd); - -// pixelsNum 0: write all 4 pixels -// 1/2/3: residual pixels 1/2/3 -static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst, - int dst_stride) { - if (2 == width) { - if (0 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); - *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]); - } else if (1 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - } else if (2 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - } else if (3 == pixelsNum) { - *(int *)dst = _mm_cvtsi128_si32(u[0]); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]); - *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]); - } - } else { - if (0 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); - _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]); - } else if (1 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - } else if (2 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - } else if (3 == pixelsNum) { - _mm_storel_epi64((__m128i *)dst, u[0]); - _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]); - _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]); - } - } -} - -// 16-bit pixels clip with bd (10/12) -static void highbd_clip(__m128i *p, int numVecs, int bd) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - __m128i clamped, mask; - int i; - - for (i = 0; i < numVecs; i++) { - mask = _mm_cmpgt_epi16(p[i], max); - clamped = _mm_andnot_si128(mask, p[i]); - mask = _mm_and_si128(mask, max); - clamped = _mm_or_si128(mask, clamped); - mask = _mm_cmpgt_epi16(clamped, zero); - p[i] = _mm_and_si128(clamped, mask); - } -} - -static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) { - __m128i v0, v1; - __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - - u[0] = _mm_loadu_si128((__m128i const *)src); - u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride)); - u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - - u[0] = _mm_add_epi32(u[0], rnd); - u[1] = _mm_add_epi32(u[1], rnd); - u[2] = _mm_add_epi32(u[2], rnd); - u[3] = _mm_add_epi32(u[3], rnd); - - u[0] = _mm_srai_epi32(u[0], FILTER_BITS); - u[1] = _mm_srai_epi32(u[1], FILTER_BITS); - u[2] = _mm_srai_epi32(u[2], FILTER_BITS); - u[3] = _mm_srai_epi32(u[3], FILTER_BITS); - - u[0] = _mm_packus_epi32(u[0], u[1]); - u[1] = _mm_packus_epi32(u[2], u[3]); - - highbd_clip(u, 2, bd); - - v0 = _mm_unpacklo_epi16(u[0], u[1]); - v1 = _mm_unpackhi_epi16(u[0], u[1]); - - u[0] = _mm_unpacklo_epi16(v0, v1); - u[2] = _mm_unpackhi_epi16(v0, v1); - - u[1] = _mm_srli_si128(u[0], 8); - u[3] = _mm_srli_si128(u[2], 8); -} - -// pixelsNum = 0 : all 4 rows of pixels will be saved. -// pixelsNum = 1/2/3 : residual 1/2/4 rows of pixels will be saved. -void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride, - uint16_t *dst, int dst_stride, int bd) { - __m128i u[4]; - transClipPixel(src, src_stride, u, bd); - writePixel(u, width, pixelsNum, dst, dst_stride); -} - -void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src, - int src_stride, uint16_t *dst, int dst_stride, - int bd) { - __m128i u[4], v[4]; - const __m128i ones = _mm_set1_epi16(1); - - transClipPixel(src, src_stride, u, bd); - - v[0] = _mm_loadl_epi64((__m128i const *)dst); - v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride)); - v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride)); - v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride)); - - u[0] = _mm_add_epi16(u[0], v[0]); - u[1] = _mm_add_epi16(u[1], v[1]); - u[2] = _mm_add_epi16(u[2], v[2]); - u[3] = _mm_add_epi16(u[3], v[3]); - - u[0] = _mm_add_epi16(u[0], ones); - u[1] = _mm_add_epi16(u[1], ones); - u[2] = _mm_add_epi16(u[2], ones); - u[3] = _mm_add_epi16(u[3], ones); - - u[0] = _mm_srai_epi16(u[0], 1); - u[1] = _mm_srai_epi16(u[1], 1); - u[2] = _mm_srai_epi16(u[2], 1); - u[3] = _mm_srai_epi16(u[3], 1); - - writePixel(u, width, pixelsNum, dst, dst_stride); -} - -// Vertical convolutional filter - -typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst); - -static void highbdRndingPacks(__m128i *u) { - __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - u[0] = _mm_add_epi32(u[0], rnd); - u[0] = _mm_srai_epi32(u[0], FILTER_BITS); - u[0] = _mm_packus_epi32(u[0], u[0]); -} - -static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) { - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]); -} - -static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) { - __m128i v = _mm_loadl_epi64((__m128i const *)dst); - const __m128i ones = _mm_set1_epi16(1); - - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - - v = _mm_add_epi16(v, u[0]); - v = _mm_add_epi16(v, ones); - v = _mm_srai_epi16(v, 1); - *(uint32_t *)dst = _mm_cvtsi128_si32(v); -} - -WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum }; - -static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) { - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - _mm_storel_epi64((__m128i *)dst, u[0]); -} - -static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) { - __m128i v = _mm_loadl_epi64((__m128i const *)dst); - const __m128i ones = _mm_set1_epi16(1); - - highbdRndingPacks(u); - highbd_clip(u, 1, bd); - - v = _mm_add_epi16(v, u[0]); - v = _mm_add_epi16(v, ones); - v = _mm_srai_epi16(v, 1); - _mm_storel_epi64((__m128i *)dst, v); -} - -WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum }; diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c deleted file mode 100644 index 5db2ccf6c..000000000 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c +++ /dev/null @@ -1,1945 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_config.h" - -#include "config/av1_rtcd.h" - -#include "av1/common/av1_inv_txfm1d_cfg.h" -#include "av1/common/x86/av1_txfm_sse2.h" -#include "av1/common/x86/av1_inv_txfm_avx2.h" -#include "av1/common/x86/av1_inv_txfm_ssse3.h" - -// TODO(venkatsanampudi@ittiam.com): move this to header file - -// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 -static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, - 4 * 5793 }; - -static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(&x1[0], &x1[3]); - btf_16_adds_subs_avx2(&x1[1], &x1[2]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); - - btf_16_adds_subs_avx2(&x1[8], &x1[11]); - btf_16_adds_subs_avx2(&x1[9], &x1[10]); - btf_16_adds_subs_avx2(&x1[15], &x1[12]); - btf_16_adds_subs_avx2(&x1[14], &x1[13]); -} - -static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(&x[0], &x[7]); - btf_16_adds_subs_avx2(&x[1], &x[6]); - btf_16_adds_subs_avx2(&x[2], &x[5]); - btf_16_adds_subs_avx2(&x[3], &x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); -} - -static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { - btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); - btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); - btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); - btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); - btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); - btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); - btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); - btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); -} - -static void idct16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)(cos_bit); - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); - __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); - __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); - __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); - __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); - __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); - __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); - __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); - __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); - __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); - __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); - __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); - __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); - __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); - __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); - __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - - // stage 1 - __m256i x1[16]; - x1[0] = input[0]; - x1[1] = input[8]; - x1[2] = input[4]; - x1[3] = input[12]; - x1[4] = input[2]; - x1[5] = input[10]; - x1[6] = input[6]; - x1[7] = input[14]; - x1[8] = input[1]; - x1[9] = input[9]; - x1[10] = input[5]; - x1[11] = input[13]; - x1[12] = input[3]; - x1[13] = input[11]; - x1[14] = input[7]; - x1[15] = input[15]; - - // stage 2 - btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); - btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); - btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); - btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); - - // stage 3 - btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); - btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); - btf_16_adds_subs_avx2(&x1[8], &x1[9]); - btf_16_adds_subs_avx2(&x1[11], &x1[10]); - btf_16_adds_subs_avx2(&x1[12], &x1[13]); - btf_16_adds_subs_avx2(&x1[15], &x1[14]); - - // stage 4 - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); - btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); - btf_16_adds_subs_avx2(&x1[4], &x1[5]); - btf_16_adds_subs_avx2(&x1[7], &x1[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); - - idct16_stage5_avx2(x1, cospi, _r, cos_bit); - idct16_stage6_avx2(x1, cospi, _r, cos_bit); - idct16_stage7_avx2(output, x1); -} - -static void idct16_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)(cos_bit); - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - - // stage 1 - __m256i x1[16]; - x1[0] = input[0]; - x1[2] = input[4]; - x1[4] = input[2]; - x1[6] = input[6]; - x1[8] = input[1]; - x1[10] = input[5]; - x1[12] = input[3]; - x1[14] = input[7]; - - // stage 2 - btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); - btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); - btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); - btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); - - // stage 3 - btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); - btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); - btf_16_adds_subs_avx2(&x1[8], &x1[9]); - btf_16_adds_subs_avx2(&x1[11], &x1[10]); - btf_16_adds_subs_avx2(&x1[12], &x1[13]); - btf_16_adds_subs_avx2(&x1[15], &x1[14]); - - // stage 4 - btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); - btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); - btf_16_adds_subs_avx2(&x1[4], &x1[5]); - btf_16_adds_subs_avx2(&x1[7], &x1[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); - - idct16_stage5_avx2(x1, cospi, _r, cos_bit); - idct16_stage6_avx2(x1, cospi, _r, cos_bit); - idct16_stage7_avx2(output, x1); -} - -static void idct16_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)(cos_bit); - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - // stage 1 - __m256i x1[2]; - x1[0] = input[0]; - - // stage 2 - // stage 3 - // stage 4 - btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); - - // stage 5 - // stage 6 - output[0] = x1[0]; - output[1] = x1[1]; - output[2] = x1[1]; - output[3] = x1[0]; - output[4] = x1[0]; - output[5] = x1[1]; - output[6] = x1[1]; - output[7] = x1[0]; - output[8] = x1[0]; - output[9] = x1[1]; - output[10] = x1[1]; - output[11] = x1[0]; - output[12] = x1[0]; - output[13] = x1[1]; - output[14] = x1[1]; - output[15] = x1[0]; -} - -static INLINE void iadst16_stage3_avx2(__m256i *x) { - btf_16_adds_subs_avx2(&x[0], &x[8]); - btf_16_adds_subs_avx2(&x[1], &x[9]); - btf_16_adds_subs_avx2(&x[2], &x[10]); - btf_16_adds_subs_avx2(&x[3], &x[11]); - btf_16_adds_subs_avx2(&x[4], &x[12]); - btf_16_adds_subs_avx2(&x[5], &x[13]); - btf_16_adds_subs_avx2(&x[6], &x[14]); - btf_16_adds_subs_avx2(&x[7], &x[15]); -} - -static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); - const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); - const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); - const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); - const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); - const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); - btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); - btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); - btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); - btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); -} - -static INLINE void iadst16_stage5_avx2(__m256i *x) { - btf_16_adds_subs_avx2(&x[0], &x[4]); - btf_16_adds_subs_avx2(&x[1], &x[5]); - btf_16_adds_subs_avx2(&x[2], &x[6]); - btf_16_adds_subs_avx2(&x[3], &x[7]); - btf_16_adds_subs_avx2(&x[8], &x[12]); - btf_16_adds_subs_avx2(&x[9], &x[13]); - btf_16_adds_subs_avx2(&x[10], &x[14]); - btf_16_adds_subs_avx2(&x[11], &x[15]); -} - -static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); - const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); - const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); -} - -static INLINE void iadst16_stage7_avx2(__m256i *x) { - btf_16_adds_subs_avx2(&x[0], &x[2]); - btf_16_adds_subs_avx2(&x[1], &x[3]); - btf_16_adds_subs_avx2(&x[4], &x[6]); - btf_16_adds_subs_avx2(&x[5], &x[7]); - btf_16_adds_subs_avx2(&x[8], &x[10]); - btf_16_adds_subs_avx2(&x[9], &x[11]); - btf_16_adds_subs_avx2(&x[12], &x[14]); - btf_16_adds_subs_avx2(&x[13], &x[15]); -} - -static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); -} - -static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { - const __m256i __zero = _mm256_setzero_si256(); - output[0] = x1[0]; - output[1] = _mm256_subs_epi16(__zero, x1[8]); - output[2] = x1[12]; - output[3] = _mm256_subs_epi16(__zero, x1[4]); - output[4] = x1[6]; - output[5] = _mm256_subs_epi16(__zero, x1[14]); - output[6] = x1[10]; - output[7] = _mm256_subs_epi16(__zero, x1[2]); - output[8] = x1[3]; - output[9] = _mm256_subs_epi16(__zero, x1[11]); - output[10] = x1[15]; - output[11] = _mm256_subs_epi16(__zero, x1[7]); - output[12] = x1[5]; - output[13] = _mm256_subs_epi16(__zero, x1[13]); - output[14] = x1[9]; - output[15] = _mm256_subs_epi16(__zero, x1[1]); -} - -static void iadst16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)(cos_bit); - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); - __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); - __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); - __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); - __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); - __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); - __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); - __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); - __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); - __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); - __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); - __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); - __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); - __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); - __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); - __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); - - // stage 1 - __m256i x1[16]; - x1[0] = input[15]; - x1[1] = input[0]; - x1[2] = input[13]; - x1[3] = input[2]; - x1[4] = input[11]; - x1[5] = input[4]; - x1[6] = input[9]; - x1[7] = input[6]; - x1[8] = input[7]; - x1[9] = input[8]; - x1[10] = input[5]; - x1[11] = input[10]; - x1[12] = input[3]; - x1[13] = input[12]; - x1[14] = input[1]; - x1[15] = input[14]; - - // stage 2 - btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); - btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); - btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); - btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); - btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); - btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); - btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); - btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); - - iadst16_stage3_avx2(x1); - iadst16_stage4_avx2(x1, cospi, _r, cos_bit); - iadst16_stage5_avx2(x1); - iadst16_stage6_avx2(x1, cospi, _r, cos_bit); - iadst16_stage7_avx2(x1); - iadst16_stage8_avx2(x1, cospi, _r, cos_bit); - iadst16_stage9_avx2(output, x1); -} - -static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)(cos_bit); - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - // stage 1 - __m256i x1[16]; - x1[1] = input[0]; - x1[3] = input[2]; - x1[5] = input[4]; - x1[7] = input[6]; - x1[8] = input[7]; - x1[10] = input[5]; - x1[12] = input[3]; - x1[14] = input[1]; - - // stage 2 - btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); - btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); - btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); - btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); - btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); - btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); - btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); - btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); - - iadst16_stage3_avx2(x1); - iadst16_stage4_avx2(x1, cospi, _r, cos_bit); - iadst16_stage5_avx2(x1); - iadst16_stage6_avx2(x1, cospi, _r, cos_bit); - iadst16_stage7_avx2(x1); - iadst16_stage8_avx2(x1, cospi, _r, cos_bit); - iadst16_stage9_avx2(output, x1); -} - -static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)(cos_bit); - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); - const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); - const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); - const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); - - // stage 1 - __m256i x1[16]; - x1[1] = input[0]; - - // stage 2 - btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); - - // stage 3 - x1[8] = x1[0]; - x1[9] = x1[1]; - - // stage 4 - btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); - - // stage 5 - x1[4] = x1[0]; - x1[5] = x1[1]; - - x1[12] = x1[8]; - x1[13] = x1[9]; - - // stage 6 - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); - btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); - - // stage 7 - x1[2] = x1[0]; - x1[3] = x1[1]; - x1[6] = x1[4]; - x1[7] = x1[5]; - x1[10] = x1[8]; - x1[11] = x1[9]; - x1[14] = x1[12]; - x1[15] = x1[13]; - - iadst16_stage8_avx2(x1, cospi, _r, cos_bit); - iadst16_stage9_avx2(output, x1); -} - -static INLINE void idct32_high16_stage3_avx2(__m256i *x) { - btf_16_adds_subs_avx2(&x[16], &x[17]); - btf_16_adds_subs_avx2(&x[19], &x[18]); - btf_16_adds_subs_avx2(&x[20], &x[21]); - btf_16_adds_subs_avx2(&x[23], &x[22]); - btf_16_adds_subs_avx2(&x[24], &x[25]); - btf_16_adds_subs_avx2(&x[27], &x[26]); - btf_16_adds_subs_avx2(&x[28], &x[29]); - btf_16_adds_subs_avx2(&x[31], &x[30]); -} - -static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); - const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); - const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); - const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); - const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); - const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); -} - -static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); - btf_16_adds_subs_avx2(&x[16], &x[19]); - btf_16_adds_subs_avx2(&x[17], &x[18]); - btf_16_adds_subs_avx2(&x[23], &x[20]); - btf_16_adds_subs_avx2(&x[22], &x[21]); - btf_16_adds_subs_avx2(&x[24], &x[27]); - btf_16_adds_subs_avx2(&x[25], &x[26]); - btf_16_adds_subs_avx2(&x[31], &x[28]); - btf_16_adds_subs_avx2(&x[30], &x[29]); -} - -static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); - btf_16_adds_subs_avx2(&x[8], &x[11]); - btf_16_adds_subs_avx2(&x[9], &x[10]); - btf_16_adds_subs_avx2(&x[15], &x[12]); - btf_16_adds_subs_avx2(&x[14], &x[13]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); -} - -static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(&x[0], &x[7]); - btf_16_adds_subs_avx2(&x[1], &x[6]); - btf_16_adds_subs_avx2(&x[2], &x[5]); - btf_16_adds_subs_avx2(&x[3], &x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); - btf_16_adds_subs_avx2(&x[16], &x[23]); - btf_16_adds_subs_avx2(&x[17], &x[22]); - btf_16_adds_subs_avx2(&x[18], &x[21]); - btf_16_adds_subs_avx2(&x[19], &x[20]); - btf_16_adds_subs_avx2(&x[31], &x[24]); - btf_16_adds_subs_avx2(&x[30], &x[25]); - btf_16_adds_subs_avx2(&x[29], &x[26]); - btf_16_adds_subs_avx2(&x[28], &x[27]); -} - -static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(&x[0], &x[15]); - btf_16_adds_subs_avx2(&x[1], &x[14]); - btf_16_adds_subs_avx2(&x[2], &x[13]); - btf_16_adds_subs_avx2(&x[3], &x[12]); - btf_16_adds_subs_avx2(&x[4], &x[11]); - btf_16_adds_subs_avx2(&x[5], &x[10]); - btf_16_adds_subs_avx2(&x[6], &x[9]); - btf_16_adds_subs_avx2(&x[7], &x[8]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); -} - -static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { - btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); - btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); - btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); - btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); - btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); - btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); - btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); - btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); - btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); - btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); - btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); - btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); - btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); - btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); - btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); - btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); -} - -static void idct32_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - // stage 1 - __m256i x[2]; - x[0] = input[0]; - - // stage 2 - // stage 3 - // stage 4 - // stage 5 - btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - - // stage 6 - // stage 7 - // stage 8 - // stage 9 - output[0] = x[0]; - output[31] = x[0]; - output[1] = x[1]; - output[30] = x[1]; - output[2] = x[1]; - output[29] = x[1]; - output[3] = x[0]; - output[28] = x[0]; - output[4] = x[0]; - output[27] = x[0]; - output[5] = x[1]; - output[26] = x[1]; - output[6] = x[1]; - output[25] = x[1]; - output[7] = x[0]; - output[24] = x[0]; - output[8] = x[0]; - output[23] = x[0]; - output[9] = x[1]; - output[22] = x[1]; - output[10] = x[1]; - output[21] = x[1]; - output[11] = x[0]; - output[20] = x[0]; - output[12] = x[0]; - output[19] = x[0]; - output[13] = x[1]; - output[18] = x[1]; - output[14] = x[1]; - output[17] = x[1]; - output[15] = x[0]; - output[16] = x[0]; -} - -static void idct32_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - // stage 1 - __m256i x[32]; - x[0] = input[0]; - x[4] = input[4]; - x[8] = input[2]; - x[12] = input[6]; - x[16] = input[1]; - x[20] = input[5]; - x[24] = input[3]; - x[28] = input[7]; - - // stage 2 - btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); - - // stage 3 - btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); - x[17] = x[16]; - x[18] = x[19]; - x[21] = x[20]; - x[22] = x[23]; - x[25] = x[24]; - x[26] = x[27]; - x[29] = x[28]; - x[30] = x[31]; - - // stage 4 - btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); - x[9] = x[8]; - x[10] = x[11]; - x[13] = x[12]; - x[14] = x[15]; - idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); - - // stage 5 - btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - x[5] = x[4]; - x[6] = x[7]; - idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); - // stage 6 - x[3] = x[0]; - x[2] = x[1]; - idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); - - idct32_stage7_avx2(x, cospi, _r, cos_bit); - idct32_stage8_avx2(x, cospi, _r, cos_bit); - idct32_stage9_avx2(output, x); -} - -static void idct32_low16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - // stage 1 - __m256i x[32]; - x[0] = input[0]; - x[2] = input[8]; - x[4] = input[4]; - x[6] = input[12]; - x[8] = input[2]; - x[10] = input[10]; - x[12] = input[6]; - x[14] = input[14]; - x[16] = input[1]; - x[18] = input[9]; - x[20] = input[5]; - x[22] = input[13]; - x[24] = input[3]; - x[26] = input[11]; - x[28] = input[7]; - x[30] = input[15]; - - // stage 2 - btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); - btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); - btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); - btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); - btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); - - // stage 3 - btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); - btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); - btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); - idct32_high16_stage3_avx2(x); - - // stage 4 - btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); - btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(&x[8], &x[9]); - btf_16_adds_subs_avx2(&x[11], &x[10]); - btf_16_adds_subs_avx2(&x[12], &x[13]); - btf_16_adds_subs_avx2(&x[15], &x[14]); - idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); - - // stage 5 - btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_avx2(&x[4], &x[5]); - btf_16_adds_subs_avx2(&x[7], &x[6]); - idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); - - btf_16_adds_subs_avx2(&x[0], &x[3]); - btf_16_adds_subs_avx2(&x[1], &x[2]); - idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); - - idct32_stage7_avx2(x, cospi, _r, cos_bit); - idct32_stage8_avx2(x, cospi, _r, cos_bit); - idct32_stage9_avx2(output, x); -} - -static void idct32_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)(cos_bit); - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); - __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); - __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); - __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); - __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); - __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); - __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); - __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); - __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); - __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); - __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); - __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); - __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); - __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); - __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); - __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); - __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); - __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); - __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); - __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); - __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); - __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); - __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); - __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); - __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); - __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); - __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); - __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); - __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); - __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); - __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); - - // stage 1 - __m256i x1[32]; - x1[0] = input[0]; - x1[1] = input[16]; - x1[2] = input[8]; - x1[3] = input[24]; - x1[4] = input[4]; - x1[5] = input[20]; - x1[6] = input[12]; - x1[7] = input[28]; - x1[8] = input[2]; - x1[9] = input[18]; - x1[10] = input[10]; - x1[11] = input[26]; - x1[12] = input[6]; - x1[13] = input[22]; - x1[14] = input[14]; - x1[15] = input[30]; - x1[16] = input[1]; - x1[17] = input[17]; - x1[18] = input[9]; - x1[19] = input[25]; - x1[20] = input[5]; - x1[21] = input[21]; - x1[22] = input[13]; - x1[23] = input[29]; - x1[24] = input[3]; - x1[25] = input[19]; - x1[26] = input[11]; - x1[27] = input[27]; - x1[28] = input[7]; - x1[29] = input[23]; - x1[30] = input[15]; - x1[31] = input[31]; - - // stage 2 - btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit); - btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit); - btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit); - btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit); - btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit); - btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit); - btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit); - btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit); - - // stage 3 - btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); - btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); - btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); - btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); - idct32_high16_stage3_avx2(x1); - - // stage 4 - btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); - btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); - btf_16_adds_subs_avx2(&x1[8], &x1[9]); - btf_16_adds_subs_avx2(&x1[11], &x1[10]); - btf_16_adds_subs_avx2(&x1[12], &x1[13]); - btf_16_adds_subs_avx2(&x1[15], &x1[14]); - idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit); - - // stage 5 - btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); - btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); - btf_16_adds_subs_avx2(&x1[4], &x1[5]); - btf_16_adds_subs_avx2(&x1[7], &x1[6]); - idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit); - - // stage 6 - btf_16_adds_subs_avx2(&x1[0], &x1[3]); - btf_16_adds_subs_avx2(&x1[1], &x1[2]); - idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit); - - idct32_stage7_avx2(x1, cospi, _r, cos_bit); - idct32_stage8_avx2(x1, cospi, _r, cos_bit); - idct32_stage9_avx2(output, x1); -} - -static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - (void)cos_bit; - const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); - const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); - const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); - const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); - const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); - const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); - const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); - const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); - const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); - const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); - const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); - const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); - btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); - btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); - btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); - btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); - btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); - btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); - btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); - btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); -} - -static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - (void)cos_bit; - const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); - const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); - const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); - const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); - const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); - const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); - btf_16_adds_subs_avx2(&x[32], &x[35]); - btf_16_adds_subs_avx2(&x[33], &x[34]); - btf_16_adds_subs_avx2(&x[39], &x[36]); - btf_16_adds_subs_avx2(&x[38], &x[37]); - btf_16_adds_subs_avx2(&x[40], &x[43]); - btf_16_adds_subs_avx2(&x[41], &x[42]); - btf_16_adds_subs_avx2(&x[47], &x[44]); - btf_16_adds_subs_avx2(&x[46], &x[45]); - btf_16_adds_subs_avx2(&x[48], &x[51]); - btf_16_adds_subs_avx2(&x[49], &x[50]); - btf_16_adds_subs_avx2(&x[55], &x[52]); - btf_16_adds_subs_avx2(&x[54], &x[53]); - btf_16_adds_subs_avx2(&x[56], &x[59]); - btf_16_adds_subs_avx2(&x[57], &x[58]); - btf_16_adds_subs_avx2(&x[63], &x[60]); - btf_16_adds_subs_avx2(&x[62], &x[61]); -} - -static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - (void)cos_bit; - const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); - const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); - const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); - const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); - const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); - const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); - btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); - btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); -} - -static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - btf_16_adds_subs_avx2(&x[16], &x[19]); - btf_16_adds_subs_avx2(&x[17], &x[18]); - btf_16_adds_subs_avx2(&x[23], &x[20]); - btf_16_adds_subs_avx2(&x[22], &x[21]); - btf_16_adds_subs_avx2(&x[24], &x[27]); - btf_16_adds_subs_avx2(&x[25], &x[26]); - btf_16_adds_subs_avx2(&x[31], &x[28]); - btf_16_adds_subs_avx2(&x[30], &x[29]); - idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); -} - -static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - (void)cos_bit; - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); - btf_16_adds_subs_avx2(&x[32], &x[39]); - btf_16_adds_subs_avx2(&x[33], &x[38]); - btf_16_adds_subs_avx2(&x[34], &x[37]); - btf_16_adds_subs_avx2(&x[35], &x[36]); - btf_16_adds_subs_avx2(&x[47], &x[40]); - btf_16_adds_subs_avx2(&x[46], &x[41]); - btf_16_adds_subs_avx2(&x[45], &x[42]); - btf_16_adds_subs_avx2(&x[44], &x[43]); - btf_16_adds_subs_avx2(&x[48], &x[55]); - btf_16_adds_subs_avx2(&x[49], &x[54]); - btf_16_adds_subs_avx2(&x[50], &x[53]); - btf_16_adds_subs_avx2(&x[51], &x[52]); - btf_16_adds_subs_avx2(&x[63], &x[56]); - btf_16_adds_subs_avx2(&x[62], &x[57]); - btf_16_adds_subs_avx2(&x[61], &x[58]); - btf_16_adds_subs_avx2(&x[60], &x[59]); -} - -static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - (void)cos_bit; - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - btf_16_adds_subs_avx2(&x[16], &x[23]); - btf_16_adds_subs_avx2(&x[17], &x[22]); - btf_16_adds_subs_avx2(&x[18], &x[21]); - btf_16_adds_subs_avx2(&x[19], &x[20]); - btf_16_adds_subs_avx2(&x[31], &x[24]); - btf_16_adds_subs_avx2(&x[30], &x[25]); - btf_16_adds_subs_avx2(&x[29], &x[26]); - btf_16_adds_subs_avx2(&x[28], &x[27]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); -} - -static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - (void)cos_bit; - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(&x[0], &x[15]); - btf_16_adds_subs_avx2(&x[1], &x[14]); - btf_16_adds_subs_avx2(&x[2], &x[13]); - btf_16_adds_subs_avx2(&x[3], &x[12]); - btf_16_adds_subs_avx2(&x[4], &x[11]); - btf_16_adds_subs_avx2(&x[5], &x[10]); - btf_16_adds_subs_avx2(&x[6], &x[9]); - btf_16_adds_subs_avx2(&x[7], &x[8]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); - btf_16_adds_subs_avx2(&x[32], &x[47]); - btf_16_adds_subs_avx2(&x[33], &x[46]); - btf_16_adds_subs_avx2(&x[34], &x[45]); - btf_16_adds_subs_avx2(&x[35], &x[44]); - btf_16_adds_subs_avx2(&x[36], &x[43]); - btf_16_adds_subs_avx2(&x[37], &x[42]); - btf_16_adds_subs_avx2(&x[38], &x[41]); - btf_16_adds_subs_avx2(&x[39], &x[40]); - btf_16_adds_subs_avx2(&x[63], &x[48]); - btf_16_adds_subs_avx2(&x[62], &x[49]); - btf_16_adds_subs_avx2(&x[61], &x[50]); - btf_16_adds_subs_avx2(&x[60], &x[51]); - btf_16_adds_subs_avx2(&x[59], &x[52]); - btf_16_adds_subs_avx2(&x[58], &x[53]); - btf_16_adds_subs_avx2(&x[57], &x[54]); - btf_16_adds_subs_avx2(&x[56], &x[55]); -} - -static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, - const __m256i _r, int8_t cos_bit) { - (void)cos_bit; - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_avx2(&x[0], &x[31]); - btf_16_adds_subs_avx2(&x[1], &x[30]); - btf_16_adds_subs_avx2(&x[2], &x[29]); - btf_16_adds_subs_avx2(&x[3], &x[28]); - btf_16_adds_subs_avx2(&x[4], &x[27]); - btf_16_adds_subs_avx2(&x[5], &x[26]); - btf_16_adds_subs_avx2(&x[6], &x[25]); - btf_16_adds_subs_avx2(&x[7], &x[24]); - btf_16_adds_subs_avx2(&x[8], &x[23]); - btf_16_adds_subs_avx2(&x[9], &x[22]); - btf_16_adds_subs_avx2(&x[10], &x[21]); - btf_16_adds_subs_avx2(&x[11], &x[20]); - btf_16_adds_subs_avx2(&x[12], &x[19]); - btf_16_adds_subs_avx2(&x[13], &x[18]); - btf_16_adds_subs_avx2(&x[14], &x[17]); - btf_16_adds_subs_avx2(&x[15], &x[16]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); -} - -static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { - btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); - btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); - btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); - btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); - btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); - btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); - btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); - btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); - btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); - btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); - btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); - btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); - btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); - btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); - btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); - btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); - btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); - btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); - btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); - btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); - btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); - btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); - btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); - btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]); - btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); - btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); - btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); - btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); - btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); - btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); - btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); - btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); -} - -static void idct64_low1_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - // stage 1 - __m256i x[32]; - x[0] = input[0]; - - // stage 2 - // stage 3 - // stage 4 - // stage 5 - // stage 6 - btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - - // stage 7 - // stage 8 - // stage 9 - // stage 10 - // stage 11 - output[0] = x[0]; - output[63] = x[0]; - output[1] = x[1]; - output[62] = x[1]; - output[2] = x[1]; - output[61] = x[1]; - output[3] = x[0]; - output[60] = x[0]; - output[4] = x[0]; - output[59] = x[0]; - output[5] = x[1]; - output[58] = x[1]; - output[6] = x[1]; - output[57] = x[1]; - output[7] = x[0]; - output[56] = x[0]; - output[8] = x[0]; - output[55] = x[0]; - output[9] = x[1]; - output[54] = x[1]; - output[10] = x[1]; - output[53] = x[1]; - output[11] = x[0]; - output[52] = x[0]; - output[12] = x[0]; - output[51] = x[0]; - output[13] = x[1]; - output[50] = x[1]; - output[14] = x[1]; - output[49] = x[1]; - output[15] = x[0]; - output[48] = x[0]; - output[16] = x[0]; - output[47] = x[0]; - output[17] = x[1]; - output[46] = x[1]; - output[18] = x[1]; - output[45] = x[1]; - output[19] = x[0]; - output[44] = x[0]; - output[20] = x[0]; - output[43] = x[0]; - output[21] = x[1]; - output[42] = x[1]; - output[22] = x[1]; - output[41] = x[1]; - output[23] = x[0]; - output[40] = x[0]; - output[24] = x[0]; - output[39] = x[0]; - output[25] = x[1]; - output[38] = x[1]; - output[26] = x[1]; - output[37] = x[1]; - output[27] = x[0]; - output[36] = x[0]; - output[28] = x[0]; - output[35] = x[0]; - output[29] = x[1]; - output[34] = x[1]; - output[30] = x[1]; - output[33] = x[1]; - output[31] = x[0]; - output[32] = x[0]; -} - -static void idct64_low8_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); - const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); - const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); - const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); - const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); - const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); - const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); - const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); - const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); - const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); - const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); - const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m256i x[64]; - x[0] = input[0]; - x[8] = input[4]; - x[16] = input[2]; - x[24] = input[6]; - x[32] = input[1]; - x[40] = input[5]; - x[48] = input[3]; - x[56] = input[7]; - - // stage 2 - btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); - btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); - btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); - btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); - - // stage 3 - btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); - x[33] = x[32]; - x[38] = x[39]; - x[41] = x[40]; - x[46] = x[47]; - x[49] = x[48]; - x[54] = x[55]; - x[57] = x[56]; - x[62] = x[63]; - - // stage 4 - btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); - x[17] = x[16]; - x[22] = x[23]; - x[25] = x[24]; - x[30] = x[31]; - btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); - btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); - btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); - btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); - - // stage 5 - x[9] = x[8]; - x[14] = x[15]; - btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); - btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); - x[35] = x[32]; - x[34] = x[33]; - x[36] = x[39]; - x[37] = x[38]; - x[43] = x[40]; - x[42] = x[41]; - x[44] = x[47]; - x[45] = x[46]; - x[51] = x[48]; - x[50] = x[49]; - x[52] = x[55]; - x[53] = x[54]; - x[59] = x[56]; - x[58] = x[57]; - x[60] = x[63]; - x[61] = x[62]; - - // stage 6 - btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); - x[19] = x[16]; - x[18] = x[17]; - x[20] = x[23]; - x[21] = x[22]; - x[27] = x[24]; - x[26] = x[25]; - x[28] = x[31]; - x[29] = x[30]; - idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); - - // stage 7 - x[3] = x[0]; - x[2] = x[1]; - x[11] = x[8]; - x[10] = x[9]; - x[12] = x[15]; - x[13] = x[14]; - idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); - - // stage 8 - x[7] = x[0]; - x[6] = x[1]; - x[5] = x[2]; - x[4] = x[3]; - x[9] = x[9]; - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); - idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); - - idct64_stage9_avx2(x, cospi, _r, cos_bit); - idct64_stage10_avx2(x, cospi, _r, cos_bit); - idct64_stage11_avx2(output, x); -} - -static void idct64_low16_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m256i x[64]; - x[0] = input[0]; - x[4] = input[8]; - x[8] = input[4]; - x[12] = input[12]; - x[16] = input[2]; - x[20] = input[10]; - x[24] = input[6]; - x[28] = input[14]; - x[32] = input[1]; - x[36] = input[9]; - x[40] = input[5]; - x[44] = input[13]; - x[48] = input[3]; - x[52] = input[11]; - x[56] = input[7]; - x[60] = input[15]; - - // stage 2 - btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); - btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); - btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); - btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); - btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); - btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); - btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); - btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); - - // stage 3 - btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); - x[33] = x[32]; - x[34] = x[35]; - x[37] = x[36]; - x[38] = x[39]; - x[41] = x[40]; - x[42] = x[43]; - x[45] = x[44]; - x[46] = x[47]; - x[49] = x[48]; - x[50] = x[51]; - x[53] = x[52]; - x[54] = x[55]; - x[57] = x[56]; - x[58] = x[59]; - x[61] = x[60]; - x[62] = x[63]; - - // stage 4 - btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); - x[17] = x[16]; - x[18] = x[19]; - x[21] = x[20]; - x[22] = x[23]; - x[25] = x[24]; - x[26] = x[27]; - x[29] = x[28]; - x[30] = x[31]; - idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); - - // stage 5 - btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); - x[9] = x[8]; - x[10] = x[11]; - x[13] = x[12]; - x[14] = x[15]; - idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); - - // stage 6 - btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - x[5] = x[4]; - x[6] = x[7]; - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); - idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); - - // stage 7 - x[3] = x[0]; - x[2] = x[1]; - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); - btf_16_adds_subs_avx2(&x[8], &x[11]); - btf_16_adds_subs_avx2(&x[9], &x[10]); - btf_16_adds_subs_avx2(&x[15], &x[12]); - btf_16_adds_subs_avx2(&x[14], &x[13]); - idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); - - // stage 8 - btf_16_adds_subs_avx2(&x[0], &x[7]); - btf_16_adds_subs_avx2(&x[1], &x[6]); - btf_16_adds_subs_avx2(&x[2], &x[5]); - btf_16_adds_subs_avx2(&x[3], &x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); - idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); - - idct64_stage9_avx2(x, cospi, _r, cos_bit); - idct64_stage10_avx2(x, cospi, _r, cos_bit); - idct64_stage11_avx2(output, x); -} - -static void idct64_low32_new_avx2(const __m256i *input, __m256i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); - const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); - const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); - const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); - const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m256i x[64]; - x[0] = input[0]; - x[2] = input[16]; - x[4] = input[8]; - x[6] = input[24]; - x[8] = input[4]; - x[10] = input[20]; - x[12] = input[12]; - x[14] = input[28]; - x[16] = input[2]; - x[18] = input[18]; - x[20] = input[10]; - x[22] = input[26]; - x[24] = input[6]; - x[26] = input[22]; - x[28] = input[14]; - x[30] = input[30]; - x[32] = input[1]; - x[34] = input[17]; - x[36] = input[9]; - x[38] = input[25]; - x[40] = input[5]; - x[42] = input[21]; - x[44] = input[13]; - x[46] = input[29]; - x[48] = input[3]; - x[50] = input[19]; - x[52] = input[11]; - x[54] = input[27]; - x[56] = input[7]; - x[58] = input[23]; - x[60] = input[15]; - x[62] = input[31]; - - // stage 2 - btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); - btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]); - btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]); - btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); - btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); - btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]); - btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]); - btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); - btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); - btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]); - btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]); - btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); - btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); - btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]); - btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]); - btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); - - // stage 3 - btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); - btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); - btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); - btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); - btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); - btf_16_adds_subs_avx2(&x[32], &x[33]); - btf_16_adds_subs_avx2(&x[35], &x[34]); - btf_16_adds_subs_avx2(&x[36], &x[37]); - btf_16_adds_subs_avx2(&x[39], &x[38]); - btf_16_adds_subs_avx2(&x[40], &x[41]); - btf_16_adds_subs_avx2(&x[43], &x[42]); - btf_16_adds_subs_avx2(&x[44], &x[45]); - btf_16_adds_subs_avx2(&x[47], &x[46]); - btf_16_adds_subs_avx2(&x[48], &x[49]); - btf_16_adds_subs_avx2(&x[51], &x[50]); - btf_16_adds_subs_avx2(&x[52], &x[53]); - btf_16_adds_subs_avx2(&x[55], &x[54]); - btf_16_adds_subs_avx2(&x[56], &x[57]); - btf_16_adds_subs_avx2(&x[59], &x[58]); - btf_16_adds_subs_avx2(&x[60], &x[61]); - btf_16_adds_subs_avx2(&x[63], &x[62]); - - // stage 4 - btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); - btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); - btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); - btf_16_adds_subs_avx2(&x[16], &x[17]); - btf_16_adds_subs_avx2(&x[19], &x[18]); - btf_16_adds_subs_avx2(&x[20], &x[21]); - btf_16_adds_subs_avx2(&x[23], &x[22]); - btf_16_adds_subs_avx2(&x[24], &x[25]); - btf_16_adds_subs_avx2(&x[27], &x[26]); - btf_16_adds_subs_avx2(&x[28], &x[29]); - btf_16_adds_subs_avx2(&x[31], &x[30]); - idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); - - // stage 5 - btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); - btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_avx2(&x[8], &x[9]); - btf_16_adds_subs_avx2(&x[11], &x[10]); - btf_16_adds_subs_avx2(&x[12], &x[13]); - btf_16_adds_subs_avx2(&x[15], &x[14]); - idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); - - // stage 6 - btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_avx2(&x[4], &x[5]); - btf_16_adds_subs_avx2(&x[7], &x[6]); - btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); - btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); - idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); - - // stage 7 - btf_16_adds_subs_avx2(&x[0], &x[3]); - btf_16_adds_subs_avx2(&x[1], &x[2]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); - btf_16_adds_subs_avx2(&x[8], &x[11]); - btf_16_adds_subs_avx2(&x[9], &x[10]); - btf_16_adds_subs_avx2(&x[15], &x[12]); - btf_16_adds_subs_avx2(&x[14], &x[13]); - idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); - - // stage 8 - btf_16_adds_subs_avx2(&x[0], &x[7]); - btf_16_adds_subs_avx2(&x[1], &x[6]); - btf_16_adds_subs_avx2(&x[2], &x[5]); - btf_16_adds_subs_avx2(&x[3], &x[4]); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); - btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); - idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); - - // stage 9~11 - idct64_stage9_avx2(x, cospi, _r, cos_bit); - idct64_stage10_avx2(x, cospi, _r, cos_bit); - idct64_stage11_avx2(output, x); -} - -// 1D functions process 16 pixels at one time. -static const transform_1d_avx2 - lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { - { - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - }, - { { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, - { - { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL }, - { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2, - NULL }, - { NULL, NULL, NULL, NULL }, - }, - { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2, - idct32_new_avx2 }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, - { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2, - idct64_low32_new_avx2 }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } } - }; - -// only process w >= 16 h >= 16 -static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - __m256i buf1[64 * 16]; - int eobx, eoby; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div16 = txfm_size_col >> 4; - const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4; - const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; - const int input_stride = AOMMIN(32, txfm_size_col); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_avx2 row_txfm = - lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_avx2 col_txfm = - lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - for (int i = 0; i < buf_size_nonzero_h_div16; i++) { - __m256i buf0[64]; - const int32_t *input_row = input + (i << 4) * input_stride; - for (int j = 0; j < buf_size_nonzero_w_div16; ++j) { - __m256i *buf0_cur = buf0 + j * 16; - const int32_t *input_cur = input_row + j * 16; - load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur, - 16); - transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); - } - if (rect_type == 1 || rect_type == -1) { - round_shift_avx2(buf0, buf0, input_stride); // rect special code - } - row_txfm(buf0, buf0, cos_bit_row); - round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); - - __m256i *buf1_cur = buf1 + (i << 4); - if (lr_flip) { - for (int j = 0; j < buf_size_w_div16; ++j) { - __m256i temp[16]; - flip_buf_avx2(buf0 + 16 * j, temp, 16); - int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); - transpose_16bit_16x16_avx2(temp, buf1_cur + offset); - } - } else { - for (int j = 0; j < buf_size_w_div16; ++j) { - transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); - } - } - } - for (int i = 0; i < buf_size_w_div16; i++) { - __m256i *buf1_cur = buf1 + i * txfm_size_row; - col_txfm(buf1_cur, buf1_cur, cos_bit_col); - round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]); - } - for (int i = 0; i < buf_size_w_div16; i++) { - lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, - stride, ud_flip, txfm_size_row); - } -} - -static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, - int stride, int shift, int height, - int txw_idx, int rect_type) { - const int32_t *input_row = input; - const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); - const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + - (1 << (NewSqrt2Bits - shift - 1))); - const __m256i one = _mm256_set1_epi16(1); - const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); - if (rect_type != 1 && rect_type != -1) { - for (int i = 0; i < height; ++i) { - const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); - input_row += stride; - __m256i lo = _mm256_unpacklo_epi16(src, one); - __m256i hi = _mm256_unpackhi_epi16(src, one); - lo = _mm256_madd_epi16(lo, scale__r); - hi = _mm256_madd_epi16(hi, scale__r); - lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); - hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); - out[i] = _mm256_packs_epi32(lo, hi); - } - } else { - const __m256i rect_scale = - _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); - for (int i = 0; i < height; ++i) { - __m256i src = load_32bit_to_16bit_w16_avx2(input_row); - src = _mm256_mulhrs_epi16(src, rect_scale); - input_row += stride; - __m256i lo = _mm256_unpacklo_epi16(src, one); - __m256i hi = _mm256_unpackhi_epi16(src, one); - lo = _mm256_madd_epi16(lo, scale__r); - hi = _mm256_madd_epi16(hi, scale__r); - lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); - hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); - out[i] = _mm256_packs_epi32(lo, hi); - } - } -} - -static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, - __m256i *buf, int shift, int height, - int txh_idx) { - const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); - const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); - const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); - const __m256i one = _mm256_set1_epi16(1); - const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); - for (int h = 0; h < height; ++h) { - __m256i lo = _mm256_unpacklo_epi16(buf[h], one); - __m256i hi = _mm256_unpackhi_epi16(buf[h], one); - lo = _mm256_madd_epi16(lo, scale_coeff); - hi = _mm256_madd_epi16(hi, scale_coeff); - lo = _mm256_srai_epi32(lo, NewSqrt2Bits); - hi = _mm256_srai_epi32(hi, NewSqrt2Bits); - lo = _mm256_add_epi32(lo, shift__r); - hi = _mm256_add_epi32(hi, shift__r); - lo = _mm256_srai_epi32(lo, -shift); - hi = _mm256_srai_epi32(hi, -shift); - const __m256i x = _mm256_packs_epi32(lo, hi); - write_recon_w16_avx2(x, output); - output += stride; - } -} - -static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, - uint8_t *output, int stride, - TX_SIZE tx_size, - int32_t eob) { - (void)eob; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int input_stride = AOMMIN(32, txfm_size_col); - const int row_max = AOMMIN(32, txfm_size_row); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - __m256i buf[32]; - for (int i = 0; i < input_stride; i += 16) { - iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max, - txw_idx, rect_type); - iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max, - txh_idx); - } -} - -static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - int eobx, eoby; - get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col); - const int input_stride = txfm_size_col_notzero; - const int buf_size_w_div16 = (eobx + 16) >> 4; - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_avx2 col_txfm = - lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - for (int i = 0; i < buf_size_w_div16; i++) { - __m256i buf0[64]; - iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0], - eoby + 1, txw_idx, rect_type); - col_txfm(buf0, buf0, cos_bit_col); - __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); - int k = ud_flip ? (txfm_size_row - 1) : 0; - const int step = ud_flip ? -1 : 1; - for (int j = 0; j < txfm_size_row; ++j, k += step) { - __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); - write_recon_w16_avx2(res, output + (i << 4) + j * stride); - } - } -} - -static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - __m256i buf1[64]; - int eobx, eoby; - get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div16 = txfm_size_col >> 4; - const int buf_size_h_div16 = (eoby + 16) >> 4; - const int input_stride = AOMMIN(32, txfm_size_col); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const transform_1d_avx2 row_txfm = - lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - - assert(row_txfm != NULL); - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - for (int i = 0; i < buf_size_h_div16; i++) { - __m256i buf0[64]; - const int32_t *input_row = input + i * input_stride * 16; - for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) { - __m256i *buf0_cur = buf0 + j * 16; - load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride, - buf0_cur, 16); - transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); - } - if (rect_type == 1 || rect_type == -1) { - round_shift_avx2(buf0, buf0, input_stride); // rect special code - } - row_txfm(buf0, buf0, cos_bit_row); - round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); - __m256i *_buf1 = buf1; - if (lr_flip) { - for (int j = 0; j < buf_size_w_div16; ++j) { - __m256i temp[16]; - flip_buf_avx2(buf0 + 16 * j, temp, 16); - transpose_16bit_16x16_avx2(temp, - _buf1 + 16 * (buf_size_w_div16 - 1 - j)); - } - } else { - for (int j = 0; j < buf_size_w_div16; ++j) { - transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); - } - } - for (int j = 0; j < buf_size_w_div16; ++j) { - iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, - buf1 + j * 16, shift[1], 16, txh_idx); - } - } -} - -// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 -static INLINE void lowbd_inv_txfm2d_add_universe_avx2( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - (void)eob; - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: // ADST in vertical, DCT in horizontal - case DCT_ADST: // DCT in vertical, ADST in horizontal - case ADST_ADST: // ADST in both directions - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, - tx_size, eob); - break; - case IDTX: - lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); - break; - case V_DCT: - case V_ADST: - case V_FLIPADST: - lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, - tx_size, eob); - break; - case H_DCT: - case H_ADST: - case H_FLIPADST: - lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, - tx_size, eob); - break; - default: - av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, - eob); - break; - } -} - -void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, TX_SIZE tx_size, - int eob) { - switch (tx_size) { - case TX_4X4: - case TX_8X8: - case TX_4X8: - case TX_8X4: - case TX_8X16: - case TX_16X8: - case TX_4X16: - case TX_16X4: - case TX_8X32: - case TX_32X8: - av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, - eob); - break; - case TX_16X16: - case TX_32X32: - case TX_64X64: - case TX_16X32: - case TX_32X16: - case TX_32X64: - case TX_64X32: - case TX_16X64: - case TX_64X16: - default: - lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, - tx_size, eob); - break; - } -} - -void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; - if (!txfm_param->lossless) { - av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, - txfm_param->tx_size, txfm_param->eob); - } else { - av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); - } -} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h deleted file mode 100644 index f74cbaeaa..000000000 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ -#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ - -#include <immintrin.h> - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/x86/transpose_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// half input is zero -#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \ - { \ - const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \ - const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \ - const __m256i _in = in; \ - out0 = _mm256_mulhrs_epi16(_in, _w0); \ - out1 = _mm256_mulhrs_epi16(_in, _w1); \ - } - -static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, - int size) { - const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); - for (int i = 0; i < size; ++i) { - output[i] = _mm256_mulhrs_epi16(input[i], scale); - } -} - -static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { - __m128i pred = _mm_loadu_si128((__m128i const *)(output)); - __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); - __m128i y = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168)); - _mm_storeu_si128((__m128i *)(output), y); -} - -static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, - int stride, int flipud, - int height) { - int j = flipud ? (height - 1) : 0; - const int step = flipud ? -1 : 1; - for (int i = 0; i < height; ++i, j += step) { - write_recon_w16_avx2(in[j], output + i * stride); - } -} - -void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, TX_SIZE tx_size, - int eob); -#ifdef __cplusplus -} -#endif - -#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c deleted file mode 100644 index 995bc3da4..000000000 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c +++ /dev/null @@ -1,2923 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "av1/common/av1_inv_txfm1d_cfg.h" -#include "av1/common/x86/av1_inv_txfm_ssse3.h" -#include "av1/common/x86/av1_txfm_sse2.h" - -// TODO(venkatsanampudi@ittiam.com): move this to header file - -// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 -static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, - 4 * 5793 }; - -// TODO(binpengsmail@gmail.com): replace some for loop with do {} while - -static void idct4_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - - // stage 1 - __m128i x[4]; - x[0] = input[0]; - x[1] = input[2]; - x[2] = input[1]; - x[3] = input[3]; - - // stage 2 - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); - btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); - - // stage 3 - btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); - btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); -} - -void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - - // stage 1 - __m128i x[4]; - x[0] = input[0]; - x[1] = input[2]; - x[2] = input[1]; - x[3] = input[3]; - - // stage 2 - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); - btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); - - // stage 3 - btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); - btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); -} - -void idct8_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - // stage 1 - __m128i x[2]; - x[0] = input[0]; - - // stage 2 - // stage 3 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - - // stage 4 - // stage 5 - output[0] = x[0]; - output[7] = x[0]; - output[1] = x[1]; - output[6] = x[1]; - output[2] = x[1]; - output[5] = x[1]; - output[3] = x[0]; - output[4] = x[0]; -} - -void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); - const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m128i x[8]; - x[0] = input[0]; - x[1] = input[4]; - x[2] = input[2]; - x[3] = input[6]; - x[4] = input[1]; - x[5] = input[5]; - x[6] = input[3]; - x[7] = input[7]; - - // stage 2 - btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); - btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); - - // stage 3 - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); - btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_subs_adds_sse2(x[7], x[6]); - - // stage 4 - btf_16_adds_subs_sse2(x[0], x[3]); - btf_16_adds_subs_sse2(x[1], x[2]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - - // stage 5 - btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); - btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); - btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); - btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); -} - -void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); - const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m128i x[8]; - x[0] = input[0]; - x[1] = input[4]; - x[2] = input[2]; - x[3] = input[6]; - x[4] = input[1]; - x[5] = input[5]; - x[6] = input[3]; - x[7] = input[7]; - - // stage 2 - btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); - btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); - - // stage 3 - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); - btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_subs_adds_sse2(x[7], x[6]); - - // stage 4 - btf_16_adds_subs_sse2(x[0], x[3]); - btf_16_adds_subs_sse2(x[1], x[2]); - btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - - // stage 5 - btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); - btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); - btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); - btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); -} - -static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_sse2(x[0], x[3]); - btf_16_adds_subs_sse2(x[1], x[2]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[11]); - btf_16_adds_subs_sse2(x[9], x[10]); - btf_16_subs_adds_sse2(x[15], x[12]); - btf_16_subs_adds_sse2(x[14], x[13]); -} - -static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_sse2(x[0], x[7]); - btf_16_adds_subs_sse2(x[1], x[6]); - btf_16_adds_subs_sse2(x[2], x[5]); - btf_16_adds_subs_sse2(x[3], x[4]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); -} - -static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { - btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]); - btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]); - btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]); - btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]); - btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]); - btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]); - btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]); - btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); -} - -static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - // stage 1 - __m128i x[2]; - x[0] = input[0]; - - // stage 2 - // stage 3 - // stage 4 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - - // stage 5 - // stage 6 - // stage 7 - output[0] = x[0]; - output[15] = x[0]; - output[1] = x[1]; - output[14] = x[1]; - output[2] = x[1]; - output[13] = x[1]; - output[3] = x[0]; - output[12] = x[0]; - output[4] = x[0]; - output[11] = x[0]; - output[5] = x[1]; - output[10] = x[1]; - output[6] = x[1]; - output[9] = x[1]; - output[7] = x[0]; - output[8] = x[0]; -} - -static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - - // stage 1 - __m128i x[16]; - x[0] = input[0]; - x[2] = input[4]; - x[4] = input[2]; - x[6] = input[6]; - x[8] = input[1]; - x[10] = input[5]; - x[12] = input[3]; - x[14] = input[7]; - - // stage 2 - btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); - btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); - btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); - - // stage 3 - btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); - btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[9]); - btf_16_subs_adds_sse2(x[11], x[10]); - btf_16_adds_subs_sse2(x[12], x[13]); - btf_16_subs_adds_sse2(x[15], x[14]); - - // stage 4 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_subs_adds_sse2(x[7], x[6]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - - idct16_stage5_sse2(x, cospi, __rounding, cos_bit); - idct16_stage6_sse2(x, cospi, __rounding, cos_bit); - idct16_stage7_sse2(output, x); -} - -void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); - const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); - const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); - const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); - const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); - const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); - const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); - const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); - const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - - // stage 1 - __m128i x[16]; - x[0] = input[0]; - x[1] = input[8]; - x[2] = input[4]; - x[3] = input[12]; - x[4] = input[2]; - x[5] = input[10]; - x[6] = input[6]; - x[7] = input[14]; - x[8] = input[1]; - x[9] = input[9]; - x[10] = input[5]; - x[11] = input[13]; - x[12] = input[3]; - x[13] = input[11]; - x[14] = input[7]; - x[15] = input[15]; - - // stage 2 - btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); - btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); - btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); - btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); - - // stage 3 - btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); - btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[9]); - btf_16_subs_adds_sse2(x[11], x[10]); - btf_16_adds_subs_sse2(x[12], x[13]); - btf_16_subs_adds_sse2(x[15], x[14]); - - // stage 4 - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); - btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_subs_adds_sse2(x[7], x[6]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - - // stage 5~7 - idct16_stage5_sse2(x, cospi, __rounding, cos_bit); - idct16_stage6_sse2(x, cospi, __rounding, cos_bit); - idct16_stage7_sse2(output, x); -} - -void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); - const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); - const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); - const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); - const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); - const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); - const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); - const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); - const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m128i x[16]; - x[0] = input[0]; - x[1] = input[8]; - x[2] = input[4]; - x[3] = input[12]; - x[4] = input[2]; - x[5] = input[10]; - x[6] = input[6]; - x[7] = input[14]; - x[8] = input[1]; - x[9] = input[9]; - x[10] = input[5]; - x[11] = input[13]; - x[12] = input[3]; - x[13] = input[11]; - x[14] = input[7]; - x[15] = input[15]; - - // stage 2 - btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); - btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); - btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); - btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); - - // stage 3 - btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); - btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[9]); - btf_16_subs_adds_sse2(x[11], x[10]); - btf_16_adds_subs_sse2(x[12], x[13]); - btf_16_subs_adds_sse2(x[15], x[14]); - - // stage 4 - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); - btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_subs_adds_sse2(x[7], x[6]); - btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - - // stage 5 - btf_16_adds_subs_sse2(x[0], x[3]); - btf_16_adds_subs_sse2(x[1], x[2]); - btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[11]); - btf_16_adds_subs_sse2(x[9], x[10]); - btf_16_subs_adds_sse2(x[15], x[12]); - btf_16_subs_adds_sse2(x[14], x[13]); - - // stage 6 - btf_16_adds_subs_sse2(x[0], x[7]); - btf_16_adds_subs_sse2(x[1], x[6]); - btf_16_adds_subs_sse2(x[2], x[5]); - btf_16_adds_subs_sse2(x[3], x[4]); - btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - - // stage 7 - idct16_stage7_sse2(output, x); -} - -static INLINE void idct32_high16_stage3_sse2(__m128i *x) { - btf_16_adds_subs_sse2(x[16], x[17]); - btf_16_subs_adds_sse2(x[19], x[18]); - btf_16_adds_subs_sse2(x[20], x[21]); - btf_16_subs_adds_sse2(x[23], x[22]); - btf_16_adds_subs_sse2(x[24], x[25]); - btf_16_subs_adds_sse2(x[27], x[26]); - btf_16_adds_subs_sse2(x[28], x[29]); - btf_16_subs_adds_sse2(x[31], x[30]); -} - -static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); - const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); - const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); - const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); - const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); - const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); - btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); - btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); - btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); -} - -static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - btf_16_adds_subs_sse2(x[16], x[19]); - btf_16_adds_subs_sse2(x[17], x[18]); - btf_16_subs_adds_sse2(x[23], x[20]); - btf_16_subs_adds_sse2(x[22], x[21]); - btf_16_adds_subs_sse2(x[24], x[27]); - btf_16_adds_subs_sse2(x[25], x[26]); - btf_16_subs_adds_sse2(x[31], x[28]); - btf_16_subs_adds_sse2(x[30], x[29]); -} - -static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[11]); - btf_16_adds_subs_sse2(x[9], x[10]); - btf_16_subs_adds_sse2(x[15], x[12]); - btf_16_subs_adds_sse2(x[14], x[13]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); -} - -static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_sse2(x[0], x[7]); - btf_16_adds_subs_sse2(x[1], x[6]); - btf_16_adds_subs_sse2(x[2], x[5]); - btf_16_adds_subs_sse2(x[3], x[4]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - btf_16_adds_subs_sse2(x[16], x[23]); - btf_16_adds_subs_sse2(x[17], x[22]); - btf_16_adds_subs_sse2(x[18], x[21]); - btf_16_adds_subs_sse2(x[19], x[20]); - btf_16_subs_adds_sse2(x[31], x[24]); - btf_16_subs_adds_sse2(x[30], x[25]); - btf_16_subs_adds_sse2(x[29], x[26]); - btf_16_subs_adds_sse2(x[28], x[27]); -} - -static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_sse2(x[0], x[15]); - btf_16_adds_subs_sse2(x[1], x[14]); - btf_16_adds_subs_sse2(x[2], x[13]); - btf_16_adds_subs_sse2(x[3], x[12]); - btf_16_adds_subs_sse2(x[4], x[11]); - btf_16_adds_subs_sse2(x[5], x[10]); - btf_16_adds_subs_sse2(x[6], x[9]); - btf_16_adds_subs_sse2(x[7], x[8]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); -} - -static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { - btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]); - btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]); - btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]); - btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]); - btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]); - btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]); - btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]); - btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]); - btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]); - btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]); - btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]); - btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]); - btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]); - btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]); - btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]); - btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); -} - -static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - // stage 1 - __m128i x[2]; - x[0] = input[0]; - - // stage 2 - // stage 3 - // stage 4 - // stage 5 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - - // stage 6 - // stage 7 - // stage 8 - // stage 9 - output[0] = x[0]; - output[31] = x[0]; - output[1] = x[1]; - output[30] = x[1]; - output[2] = x[1]; - output[29] = x[1]; - output[3] = x[0]; - output[28] = x[0]; - output[4] = x[0]; - output[27] = x[0]; - output[5] = x[1]; - output[26] = x[1]; - output[6] = x[1]; - output[25] = x[1]; - output[7] = x[0]; - output[24] = x[0]; - output[8] = x[0]; - output[23] = x[0]; - output[9] = x[1]; - output[22] = x[1]; - output[10] = x[1]; - output[21] = x[1]; - output[11] = x[0]; - output[20] = x[0]; - output[12] = x[0]; - output[19] = x[0]; - output[13] = x[1]; - output[18] = x[1]; - output[14] = x[1]; - output[17] = x[1]; - output[15] = x[0]; - output[16] = x[0]; -} - -static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - // stage 1 - __m128i x[32]; - x[0] = input[0]; - x[4] = input[4]; - x[8] = input[2]; - x[12] = input[6]; - x[16] = input[1]; - x[20] = input[5]; - x[24] = input[3]; - x[28] = input[7]; - - // stage 2 - btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); - - // stage 3 - btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); - x[17] = x[16]; - x[18] = x[19]; - x[21] = x[20]; - x[22] = x[23]; - x[25] = x[24]; - x[26] = x[27]; - x[29] = x[28]; - x[30] = x[31]; - - // stage 4 - btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); - x[9] = x[8]; - x[10] = x[11]; - x[13] = x[12]; - x[14] = x[15]; - idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); - - // stage 5 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - x[5] = x[4]; - x[6] = x[7]; - idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); - // stage 6 - x[3] = x[0]; - x[2] = x[1]; - idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); - - idct32_stage7_sse2(x, cospi, __rounding, cos_bit); - idct32_stage8_sse2(x, cospi, __rounding, cos_bit); - idct32_stage9_sse2(output, x); -} - -static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - // stage 1 - __m128i x[32]; - x[0] = input[0]; - x[2] = input[8]; - x[4] = input[4]; - x[6] = input[12]; - x[8] = input[2]; - x[10] = input[10]; - x[12] = input[6]; - x[14] = input[14]; - x[16] = input[1]; - x[18] = input[9]; - x[20] = input[5]; - x[22] = input[13]; - x[24] = input[3]; - x[26] = input[11]; - x[28] = input[7]; - x[30] = input[15]; - - // stage 2 - btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); - btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); - btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); - btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); - btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); - - // stage 3 - btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); - btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); - btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); - idct32_high16_stage3_sse2(x); - - // stage 4 - btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); - btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[9]); - btf_16_subs_adds_sse2(x[11], x[10]); - btf_16_adds_subs_sse2(x[12], x[13]); - btf_16_subs_adds_sse2(x[15], x[14]); - idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); - - // stage 5 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_subs_adds_sse2(x[7], x[6]); - idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); - - btf_16_adds_subs_sse2(x[0], x[3]); - btf_16_adds_subs_sse2(x[1], x[2]); - idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); - - idct32_stage7_sse2(x, cospi, __rounding, cos_bit); - idct32_stage8_sse2(x, cospi, __rounding, cos_bit); - idct32_stage9_sse2(output, x); -} - -static void idct32_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); - const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); - const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); - const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); - const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); - const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); - const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); - const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); - const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); - const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); - const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); - const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); - const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); - const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); - const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); - const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); - const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); - const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); - const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); - const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); - const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); - const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); - const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); - const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); - const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - - // stage 1 - __m128i x[32]; - x[0] = input[0]; - x[1] = input[16]; - x[2] = input[8]; - x[3] = input[24]; - x[4] = input[4]; - x[5] = input[20]; - x[6] = input[12]; - x[7] = input[28]; - x[8] = input[2]; - x[9] = input[18]; - x[10] = input[10]; - x[11] = input[26]; - x[12] = input[6]; - x[13] = input[22]; - x[14] = input[14]; - x[15] = input[30]; - x[16] = input[1]; - x[17] = input[17]; - x[18] = input[9]; - x[19] = input[25]; - x[20] = input[5]; - x[21] = input[21]; - x[22] = input[13]; - x[23] = input[29]; - x[24] = input[3]; - x[25] = input[19]; - x[26] = input[11]; - x[27] = input[27]; - x[28] = input[7]; - x[29] = input[23]; - x[30] = input[15]; - x[31] = input[31]; - - // stage 2 - btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]); - btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]); - btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]); - btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]); - btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]); - btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]); - btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]); - btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]); - - // stage 3 - btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); - btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); - btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); - btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); - idct32_high16_stage3_sse2(x); - - // stage 4 - btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); - btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[9]); - btf_16_subs_adds_sse2(x[11], x[10]); - btf_16_adds_subs_sse2(x[12], x[13]); - btf_16_subs_adds_sse2(x[15], x[14]); - idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); - - // stage 5 - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); - btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_adds_subs_sse2(x[7], x[6]); - idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); - - // stage 6 - btf_16_adds_subs_sse2(x[0], x[3]); - btf_16_adds_subs_sse2(x[1], x[2]); - idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); - - // stage 7~8 - idct32_stage7_sse2(x, cospi, __rounding, cos_bit); - idct32_stage8_sse2(x, cospi, __rounding, cos_bit); - idct32_stage9_sse2(output, x); -} - -static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); - const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); - const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); - const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); - const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); - const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); - const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); - const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); - const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); - const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); - const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); - const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); - btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); - btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); - btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); - btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); - btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); - btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); - btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); - btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); -} - -static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); - const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); - const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); - const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); - const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); - const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); - btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); - btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); - btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); - btf_16_adds_subs_sse2(x[32], x[35]); - btf_16_adds_subs_sse2(x[33], x[34]); - btf_16_subs_adds_sse2(x[39], x[36]); - btf_16_subs_adds_sse2(x[38], x[37]); - btf_16_adds_subs_sse2(x[40], x[43]); - btf_16_adds_subs_sse2(x[41], x[42]); - btf_16_subs_adds_sse2(x[47], x[44]); - btf_16_subs_adds_sse2(x[46], x[45]); - btf_16_adds_subs_sse2(x[48], x[51]); - btf_16_adds_subs_sse2(x[49], x[50]); - btf_16_subs_adds_sse2(x[55], x[52]); - btf_16_subs_adds_sse2(x[54], x[53]); - btf_16_adds_subs_sse2(x[56], x[59]); - btf_16_adds_subs_sse2(x[57], x[58]); - btf_16_subs_adds_sse2(x[63], x[60]); - btf_16_subs_adds_sse2(x[62], x[61]); -} - -static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); - const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); - const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); - const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); - const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); - const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); - btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); - btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); - btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); - btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); - btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); - btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); - btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); - btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); -} - -static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - btf_16_adds_subs_sse2(x[16], x[19]); - btf_16_adds_subs_sse2(x[17], x[18]); - btf_16_subs_adds_sse2(x[23], x[20]); - btf_16_subs_adds_sse2(x[22], x[21]); - btf_16_adds_subs_sse2(x[24], x[27]); - btf_16_adds_subs_sse2(x[25], x[26]); - btf_16_subs_adds_sse2(x[31], x[28]); - btf_16_subs_adds_sse2(x[30], x[29]); - idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); -} - -static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); - btf_16_adds_subs_sse2(x[32], x[39]); - btf_16_adds_subs_sse2(x[33], x[38]); - btf_16_adds_subs_sse2(x[34], x[37]); - btf_16_adds_subs_sse2(x[35], x[36]); - btf_16_subs_adds_sse2(x[47], x[40]); - btf_16_subs_adds_sse2(x[46], x[41]); - btf_16_subs_adds_sse2(x[45], x[42]); - btf_16_subs_adds_sse2(x[44], x[43]); - btf_16_adds_subs_sse2(x[48], x[55]); - btf_16_adds_subs_sse2(x[49], x[54]); - btf_16_adds_subs_sse2(x[50], x[53]); - btf_16_adds_subs_sse2(x[51], x[52]); - btf_16_subs_adds_sse2(x[63], x[56]); - btf_16_subs_adds_sse2(x[62], x[57]); - btf_16_subs_adds_sse2(x[61], x[58]); - btf_16_subs_adds_sse2(x[60], x[59]); -} - -static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - btf_16_adds_subs_sse2(x[16], x[23]); - btf_16_adds_subs_sse2(x[17], x[22]); - btf_16_adds_subs_sse2(x[18], x[21]); - btf_16_adds_subs_sse2(x[19], x[20]); - btf_16_subs_adds_sse2(x[31], x[24]); - btf_16_subs_adds_sse2(x[30], x[25]); - btf_16_subs_adds_sse2(x[29], x[26]); - btf_16_subs_adds_sse2(x[28], x[27]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); -} - -static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_sse2(x[0], x[15]); - btf_16_adds_subs_sse2(x[1], x[14]); - btf_16_adds_subs_sse2(x[2], x[13]); - btf_16_adds_subs_sse2(x[3], x[12]); - btf_16_adds_subs_sse2(x[4], x[11]); - btf_16_adds_subs_sse2(x[5], x[10]); - btf_16_adds_subs_sse2(x[6], x[9]); - btf_16_adds_subs_sse2(x[7], x[8]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); - btf_16_adds_subs_sse2(x[32], x[47]); - btf_16_adds_subs_sse2(x[33], x[46]); - btf_16_adds_subs_sse2(x[34], x[45]); - btf_16_adds_subs_sse2(x[35], x[44]); - btf_16_adds_subs_sse2(x[36], x[43]); - btf_16_adds_subs_sse2(x[37], x[42]); - btf_16_adds_subs_sse2(x[38], x[41]); - btf_16_adds_subs_sse2(x[39], x[40]); - btf_16_subs_adds_sse2(x[63], x[48]); - btf_16_subs_adds_sse2(x[62], x[49]); - btf_16_subs_adds_sse2(x[61], x[50]); - btf_16_subs_adds_sse2(x[60], x[51]); - btf_16_subs_adds_sse2(x[59], x[52]); - btf_16_subs_adds_sse2(x[58], x[53]); - btf_16_subs_adds_sse2(x[57], x[54]); - btf_16_subs_adds_sse2(x[56], x[55]); -} - -static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - btf_16_adds_subs_sse2(x[0], x[31]); - btf_16_adds_subs_sse2(x[1], x[30]); - btf_16_adds_subs_sse2(x[2], x[29]); - btf_16_adds_subs_sse2(x[3], x[28]); - btf_16_adds_subs_sse2(x[4], x[27]); - btf_16_adds_subs_sse2(x[5], x[26]); - btf_16_adds_subs_sse2(x[6], x[25]); - btf_16_adds_subs_sse2(x[7], x[24]); - btf_16_adds_subs_sse2(x[8], x[23]); - btf_16_adds_subs_sse2(x[9], x[22]); - btf_16_adds_subs_sse2(x[10], x[21]); - btf_16_adds_subs_sse2(x[11], x[20]); - btf_16_adds_subs_sse2(x[12], x[19]); - btf_16_adds_subs_sse2(x[13], x[18]); - btf_16_adds_subs_sse2(x[14], x[17]); - btf_16_adds_subs_sse2(x[15], x[16]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); -} - -static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { - btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]); - btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]); - btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]); - btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]); - btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]); - btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]); - btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]); - btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]); - btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]); - btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]); - btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]); - btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]); - btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]); - btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]); - btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]); - btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]); - btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]); - btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]); - btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]); - btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]); - btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]); - btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]); - btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]); - btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]); - btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]); - btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]); - btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]); - btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]); - btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]); - btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]); - btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]); - btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); -} - -static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - - // stage 1 - __m128i x[32]; - x[0] = input[0]; - - // stage 2 - // stage 3 - // stage 4 - // stage 5 - // stage 6 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - - // stage 7 - // stage 8 - // stage 9 - // stage 10 - // stage 11 - output[0] = x[0]; - output[63] = x[0]; - output[1] = x[1]; - output[62] = x[1]; - output[2] = x[1]; - output[61] = x[1]; - output[3] = x[0]; - output[60] = x[0]; - output[4] = x[0]; - output[59] = x[0]; - output[5] = x[1]; - output[58] = x[1]; - output[6] = x[1]; - output[57] = x[1]; - output[7] = x[0]; - output[56] = x[0]; - output[8] = x[0]; - output[55] = x[0]; - output[9] = x[1]; - output[54] = x[1]; - output[10] = x[1]; - output[53] = x[1]; - output[11] = x[0]; - output[52] = x[0]; - output[12] = x[0]; - output[51] = x[0]; - output[13] = x[1]; - output[50] = x[1]; - output[14] = x[1]; - output[49] = x[1]; - output[15] = x[0]; - output[48] = x[0]; - output[16] = x[0]; - output[47] = x[0]; - output[17] = x[1]; - output[46] = x[1]; - output[18] = x[1]; - output[45] = x[1]; - output[19] = x[0]; - output[44] = x[0]; - output[20] = x[0]; - output[43] = x[0]; - output[21] = x[1]; - output[42] = x[1]; - output[22] = x[1]; - output[41] = x[1]; - output[23] = x[0]; - output[40] = x[0]; - output[24] = x[0]; - output[39] = x[0]; - output[25] = x[1]; - output[38] = x[1]; - output[26] = x[1]; - output[37] = x[1]; - output[27] = x[0]; - output[36] = x[0]; - output[28] = x[0]; - output[35] = x[0]; - output[29] = x[1]; - output[34] = x[1]; - output[30] = x[1]; - output[33] = x[1]; - output[31] = x[0]; - output[32] = x[0]; -} - -static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); - const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); - const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); - const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); - const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); - const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); - const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); - const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); - const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); - const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); - const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); - const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m128i x[64]; - x[0] = input[0]; - x[8] = input[4]; - x[16] = input[2]; - x[24] = input[6]; - x[32] = input[1]; - x[40] = input[5]; - x[48] = input[3]; - x[56] = input[7]; - - // stage 2 - btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); - btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); - btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); - btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); - - // stage 3 - btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); - x[33] = x[32]; - x[38] = x[39]; - x[41] = x[40]; - x[46] = x[47]; - x[49] = x[48]; - x[54] = x[55]; - x[57] = x[56]; - x[62] = x[63]; - - // stage 4 - btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); - x[17] = x[16]; - x[22] = x[23]; - x[25] = x[24]; - x[30] = x[31]; - btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); - btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); - btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); - btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); - - // stage 5 - x[9] = x[8]; - x[14] = x[15]; - btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); - btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); - x[35] = x[32]; - x[34] = x[33]; - x[36] = x[39]; - x[37] = x[38]; - x[43] = x[40]; - x[42] = x[41]; - x[44] = x[47]; - x[45] = x[46]; - x[51] = x[48]; - x[50] = x[49]; - x[52] = x[55]; - x[53] = x[54]; - x[59] = x[56]; - x[58] = x[57]; - x[60] = x[63]; - x[61] = x[62]; - - // stage 6 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - x[19] = x[16]; - x[18] = x[17]; - x[20] = x[23]; - x[21] = x[22]; - x[27] = x[24]; - x[26] = x[25]; - x[28] = x[31]; - x[29] = x[30]; - idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); - - // stage 7 - x[3] = x[0]; - x[2] = x[1]; - x[11] = x[8]; - x[10] = x[9]; - x[12] = x[15]; - x[13] = x[14]; - idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 8 - x[7] = x[0]; - x[6] = x[1]; - x[5] = x[2]; - x[4] = x[3]; - x[9] = x[9]; - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); - - idct64_stage9_sse2(x, cospi, __rounding, cos_bit); - idct64_stage10_sse2(x, cospi, __rounding, cos_bit); - idct64_stage11_sse2(output, x); -} - -static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m128i x[64]; - x[0] = input[0]; - x[4] = input[8]; - x[8] = input[4]; - x[12] = input[12]; - x[16] = input[2]; - x[20] = input[10]; - x[24] = input[6]; - x[28] = input[14]; - x[32] = input[1]; - x[36] = input[9]; - x[40] = input[5]; - x[44] = input[13]; - x[48] = input[3]; - x[52] = input[11]; - x[56] = input[7]; - x[60] = input[15]; - - // stage 2 - btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); - btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); - btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); - btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); - btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); - btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); - btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); - btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); - - // stage 3 - btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); - x[33] = x[32]; - x[34] = x[35]; - x[37] = x[36]; - x[38] = x[39]; - x[41] = x[40]; - x[42] = x[43]; - x[45] = x[44]; - x[46] = x[47]; - x[49] = x[48]; - x[50] = x[51]; - x[53] = x[52]; - x[54] = x[55]; - x[57] = x[56]; - x[58] = x[59]; - x[61] = x[60]; - x[62] = x[63]; - - // stage 4 - btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); - x[17] = x[16]; - x[18] = x[19]; - x[21] = x[20]; - x[22] = x[23]; - x[25] = x[24]; - x[26] = x[27]; - x[29] = x[28]; - x[30] = x[31]; - idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); - - // stage 5 - btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); - x[9] = x[8]; - x[10] = x[11]; - x[13] = x[12]; - x[14] = x[15]; - idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 6 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - x[5] = x[4]; - x[6] = x[7]; - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 7 - x[3] = x[0]; - x[2] = x[1]; - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[11]); - btf_16_adds_subs_sse2(x[9], x[10]); - btf_16_subs_adds_sse2(x[15], x[12]); - btf_16_subs_adds_sse2(x[14], x[13]); - idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 8 - btf_16_adds_subs_sse2(x[0], x[7]); - btf_16_adds_subs_sse2(x[1], x[6]); - btf_16_adds_subs_sse2(x[2], x[5]); - btf_16_adds_subs_sse2(x[3], x[4]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); - - idct64_stage9_sse2(x, cospi, __rounding, cos_bit); - idct64_stage10_sse2(x, cospi, __rounding, cos_bit); - idct64_stage11_sse2(output, x); -} - -static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); - const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); - const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); - const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); - - // stage 1 - __m128i x[64]; - x[0] = input[0]; - x[2] = input[16]; - x[4] = input[8]; - x[6] = input[24]; - x[8] = input[4]; - x[10] = input[20]; - x[12] = input[12]; - x[14] = input[28]; - x[16] = input[2]; - x[18] = input[18]; - x[20] = input[10]; - x[22] = input[26]; - x[24] = input[6]; - x[26] = input[22]; - x[28] = input[14]; - x[30] = input[30]; - x[32] = input[1]; - x[34] = input[17]; - x[36] = input[9]; - x[38] = input[25]; - x[40] = input[5]; - x[42] = input[21]; - x[44] = input[13]; - x[46] = input[29]; - x[48] = input[3]; - x[50] = input[19]; - x[52] = input[11]; - x[54] = input[27]; - x[56] = input[7]; - x[58] = input[23]; - x[60] = input[15]; - x[62] = input[31]; - - // stage 2 - btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); - btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]); - btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]); - btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); - btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); - btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]); - btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]); - btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); - btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); - btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]); - btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]); - btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); - btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); - btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]); - btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]); - btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); - - // stage 3 - btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); - btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); - btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); - btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); - btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); - btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); - btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); - btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); - btf_16_adds_subs_sse2(x[32], x[33]); - btf_16_subs_adds_sse2(x[35], x[34]); - btf_16_adds_subs_sse2(x[36], x[37]); - btf_16_subs_adds_sse2(x[39], x[38]); - btf_16_adds_subs_sse2(x[40], x[41]); - btf_16_subs_adds_sse2(x[43], x[42]); - btf_16_adds_subs_sse2(x[44], x[45]); - btf_16_subs_adds_sse2(x[47], x[46]); - btf_16_adds_subs_sse2(x[48], x[49]); - btf_16_subs_adds_sse2(x[51], x[50]); - btf_16_adds_subs_sse2(x[52], x[53]); - btf_16_subs_adds_sse2(x[55], x[54]); - btf_16_adds_subs_sse2(x[56], x[57]); - btf_16_subs_adds_sse2(x[59], x[58]); - btf_16_adds_subs_sse2(x[60], x[61]); - btf_16_subs_adds_sse2(x[63], x[62]); - - // stage 4 - btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); - btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); - btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); - btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); - btf_16_adds_subs_sse2(x[16], x[17]); - btf_16_subs_adds_sse2(x[19], x[18]); - btf_16_adds_subs_sse2(x[20], x[21]); - btf_16_subs_adds_sse2(x[23], x[22]); - btf_16_adds_subs_sse2(x[24], x[25]); - btf_16_subs_adds_sse2(x[27], x[26]); - btf_16_adds_subs_sse2(x[28], x[29]); - btf_16_subs_adds_sse2(x[31], x[30]); - idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); - - // stage 5 - btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); - btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[9]); - btf_16_subs_adds_sse2(x[11], x[10]); - btf_16_adds_subs_sse2(x[12], x[13]); - btf_16_subs_adds_sse2(x[15], x[14]); - idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 6 - btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); - btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); - btf_16_adds_subs_sse2(x[4], x[5]); - btf_16_subs_adds_sse2(x[7], x[6]); - btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); - btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); - idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 7 - btf_16_adds_subs_sse2(x[0], x[3]); - btf_16_adds_subs_sse2(x[1], x[2]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); - btf_16_adds_subs_sse2(x[8], x[11]); - btf_16_adds_subs_sse2(x[9], x[10]); - btf_16_subs_adds_sse2(x[15], x[12]); - btf_16_subs_adds_sse2(x[14], x[13]); - idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 8 - btf_16_adds_subs_sse2(x[0], x[7]); - btf_16_adds_subs_sse2(x[1], x[6]); - btf_16_adds_subs_sse2(x[2], x[5]); - btf_16_adds_subs_sse2(x[3], x[4]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); - btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); - idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); - - // stage 9~11 - idct64_stage9_sse2(x, cospi, __rounding, cos_bit); - idct64_stage10_sse2(x, cospi, __rounding, cos_bit); - idct64_stage11_sse2(output, x); -} - -void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *sinpi = sinpi_arr(INV_COS_BIT); - const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); - const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); - const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); - const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); - const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); - const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); - const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); - const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); - __m128i x0[4]; - x0[0] = input[0]; - x0[1] = input[1]; - x0[2] = input[2]; - x0[3] = input[3]; - - __m128i u[4]; - u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); - u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); - u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); - u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); - - __m128i x1[16]; - x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 - x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); - x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 - x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); - x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 - x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); - x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 - x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); - x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 - x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03); - x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x2*sin3 - x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03); - x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 - x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02); - x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1 - x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01); - - __m128i x2[8]; - x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2 - x2[1] = _mm_add_epi32(x1[1], x1[5]); - x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4 - x2[3] = _mm_add_epi32(x1[3], x1[7]); - x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3 - x2[5] = _mm_add_epi32(x1[9], x1[11]); - x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1 - x2[7] = _mm_add_epi32(x1[13], x1[15]); - - const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - for (int i = 0; i < 4; ++i) { - __m128i out0 = _mm_add_epi32(x2[2 * i], rounding); - __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding); - out0 = _mm_srai_epi32(out0, INV_COS_BIT); - out1 = _mm_srai_epi32(out1, INV_COS_BIT); - output[i] = _mm_packs_epi32(out0, out1); - } -} - -// TODO(binpengsmail@gmail.com): -// To explore the reuse of VP9 versions of corresponding SSE2 functions and -// evaluate whether there is a possibility for further speedup. -void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *sinpi = sinpi_arr(INV_COS_BIT); - const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); - const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); - const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); - const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); - const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); - const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); - const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); - const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); - __m128i x0[4]; - x0[0] = input[0]; - x0[1] = input[1]; - x0[2] = input[2]; - x0[3] = input[3]; - - __m128i u[2]; - u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); - u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); - - __m128i x1[8]; - x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 - x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 - x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 - x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 - x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 - x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3 - x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 - x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 - - __m128i x2[4]; - x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 - x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 - x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 - x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 - - const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - for (int i = 0; i < 4; ++i) { - __m128i out0 = _mm_add_epi32(x2[i], rounding); - out0 = _mm_srai_epi32(out0, INV_COS_BIT); - output[i] = _mm_packs_epi32(out0, out0); - } -} - -static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __zero = _mm_setzero_si128(); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - - // stage 1 - __m128i x[8]; - x[1] = input[0]; - - // stage 2 - btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); - - // stage 3 - x[4] = x[0]; - x[5] = x[1]; - - // stage 4 - btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - - // stage 5 - x[2] = x[0]; - x[3] = x[1]; - x[6] = x[4]; - x[7] = x[5]; - - // stage 6 - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); - - // stage 7 - output[0] = x[0]; - output[1] = _mm_subs_epi16(__zero, x[4]); - output[2] = x[6]; - output[3] = _mm_subs_epi16(__zero, x[2]); - output[4] = x[3]; - output[5] = _mm_subs_epi16(__zero, x[7]); - output[6] = x[5]; - output[7] = _mm_subs_epi16(__zero, x[1]); -} - -void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __zero = _mm_setzero_si128(); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); - const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); - const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); - const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); - const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); - const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); - const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); - const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - - // stage 1 - __m128i x[8]; - x[0] = input[7]; - x[1] = input[0]; - x[2] = input[5]; - x[3] = input[2]; - x[4] = input[3]; - x[5] = input[4]; - x[6] = input[1]; - x[7] = input[6]; - - // stage 2 - btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); - btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); - btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); - btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); - - // stage 3 - btf_16_adds_subs_sse2(x[0], x[4]); - btf_16_adds_subs_sse2(x[1], x[5]); - btf_16_adds_subs_sse2(x[2], x[6]); - btf_16_adds_subs_sse2(x[3], x[7]); - - // stage 4 - btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); - - // stage 5 - btf_16_adds_subs_sse2(x[0], x[2]); - btf_16_adds_subs_sse2(x[1], x[3]); - btf_16_adds_subs_sse2(x[4], x[6]); - btf_16_adds_subs_sse2(x[5], x[7]); - - // stage 6 - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); - - // stage 7 - output[0] = x[0]; - output[1] = _mm_subs_epi16(__zero, x[4]); - output[2] = x[6]; - output[3] = _mm_subs_epi16(__zero, x[2]); - output[4] = x[3]; - output[5] = _mm_subs_epi16(__zero, x[7]); - output[6] = x[5]; - output[7] = _mm_subs_epi16(__zero, x[1]); -} - -void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __zero = _mm_setzero_si128(); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); - const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); - const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); - const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); - const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); - const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); - const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); - const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - - // stage 1 - __m128i x[8]; - x[0] = input[7]; - x[1] = input[0]; - x[2] = input[5]; - x[3] = input[2]; - x[4] = input[3]; - x[5] = input[4]; - x[6] = input[1]; - x[7] = input[6]; - - // stage 2 - btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); - btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); - btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); - btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); - - // stage 3 - btf_16_adds_subs_sse2(x[0], x[4]); - btf_16_adds_subs_sse2(x[1], x[5]); - btf_16_adds_subs_sse2(x[2], x[6]); - btf_16_adds_subs_sse2(x[3], x[7]); - - // stage 4 - btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); - - // stage 5 - btf_16_adds_subs_sse2(x[0], x[2]); - btf_16_adds_subs_sse2(x[1], x[3]); - btf_16_adds_subs_sse2(x[4], x[6]); - btf_16_adds_subs_sse2(x[5], x[7]); - - // stage 6 - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); - - // stage 7 - output[0] = x[0]; - output[1] = _mm_subs_epi16(__zero, x[4]); - output[2] = x[6]; - output[3] = _mm_subs_epi16(__zero, x[2]); - output[4] = x[3]; - output[5] = _mm_subs_epi16(__zero, x[7]); - output[6] = x[5]; - output[7] = _mm_subs_epi16(__zero, x[1]); -} - -static INLINE void iadst16_stage3_ssse3(__m128i *x) { - btf_16_adds_subs_sse2(x[0], x[8]); - btf_16_adds_subs_sse2(x[1], x[9]); - btf_16_adds_subs_sse2(x[2], x[10]); - btf_16_adds_subs_sse2(x[3], x[11]); - btf_16_adds_subs_sse2(x[4], x[12]); - btf_16_adds_subs_sse2(x[5], x[13]); - btf_16_adds_subs_sse2(x[6], x[14]); - btf_16_adds_subs_sse2(x[7], x[15]); -} - -static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); - const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); - const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); - const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); - btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); - btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); - btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); - btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); -} - -static INLINE void iadst16_stage5_ssse3(__m128i *x) { - btf_16_adds_subs_sse2(x[0], x[4]); - btf_16_adds_subs_sse2(x[1], x[5]); - btf_16_adds_subs_sse2(x[2], x[6]); - btf_16_adds_subs_sse2(x[3], x[7]); - btf_16_adds_subs_sse2(x[8], x[12]); - btf_16_adds_subs_sse2(x[9], x[13]); - btf_16_adds_subs_sse2(x[10], x[14]); - btf_16_adds_subs_sse2(x[11], x[15]); -} - -static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); - btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); - btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); - btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); -} - -static INLINE void iadst16_stage7_ssse3(__m128i *x) { - btf_16_adds_subs_sse2(x[0], x[2]); - btf_16_adds_subs_sse2(x[1], x[3]); - btf_16_adds_subs_sse2(x[4], x[6]); - btf_16_adds_subs_sse2(x[5], x[7]); - btf_16_adds_subs_sse2(x[8], x[10]); - btf_16_adds_subs_sse2(x[9], x[11]); - btf_16_adds_subs_sse2(x[12], x[14]); - btf_16_adds_subs_sse2(x[13], x[15]); -} - -static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, - const __m128i __rounding, - int8_t cos_bit) { - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); - btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); -} - -static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { - const __m128i __zero = _mm_setzero_si128(); - output[0] = x[0]; - output[1] = _mm_subs_epi16(__zero, x[8]); - output[2] = x[12]; - output[3] = _mm_subs_epi16(__zero, x[4]); - output[4] = x[6]; - output[5] = _mm_subs_epi16(__zero, x[14]); - output[6] = x[10]; - output[7] = _mm_subs_epi16(__zero, x[2]); - output[8] = x[3]; - output[9] = _mm_subs_epi16(__zero, x[11]); - output[10] = x[15]; - output[11] = _mm_subs_epi16(__zero, x[7]); - output[12] = x[5]; - output[13] = _mm_subs_epi16(__zero, x[13]); - output[14] = x[9]; - output[15] = _mm_subs_epi16(__zero, x[1]); -} - -static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - - // stage 1 - __m128i x[16]; - x[1] = input[0]; - - // stage 2 - btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); - - // stage 3 - x[8] = x[0]; - x[9] = x[1]; - - // stage 4 - btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); - - // stage 5 - x[4] = x[0]; - x[5] = x[1]; - x[12] = x[8]; - x[13] = x[9]; - - // stage 6 - btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); - - // stage 7 - x[2] = x[0]; - x[3] = x[1]; - x[6] = x[4]; - x[7] = x[5]; - x[10] = x[8]; - x[11] = x[9]; - x[14] = x[12]; - x[15] = x[13]; - - iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); - iadst16_stage9_ssse3(output, x); -} - -static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - // stage 1 - __m128i x[16]; - x[1] = input[0]; - x[3] = input[2]; - x[5] = input[4]; - x[7] = input[6]; - x[8] = input[7]; - x[10] = input[5]; - x[12] = input[3]; - x[14] = input[1]; - - // stage 2 - btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); - btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]); - btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); - btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); - btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); - btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); - btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); - btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); - - // stage 3 - iadst16_stage3_ssse3(x); - iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); - iadst16_stage5_ssse3(x); - iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); - iadst16_stage7_ssse3(x); - iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); - iadst16_stage9_ssse3(output, x); -} -void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); - const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); - const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); - const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); - const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); - const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); - const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); - const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); - const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); - const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); - const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); - const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); - const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); - const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); - const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); - const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); - - // stage 1 - __m128i x[16]; - x[0] = input[15]; - x[1] = input[0]; - x[2] = input[13]; - x[3] = input[2]; - x[4] = input[11]; - x[5] = input[4]; - x[6] = input[9]; - x[7] = input[6]; - x[8] = input[7]; - x[9] = input[8]; - x[10] = input[5]; - x[11] = input[10]; - x[12] = input[3]; - x[13] = input[12]; - x[14] = input[1]; - x[15] = input[14]; - - // stage 2 - btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); - btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); - btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); - btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); - btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); - btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); - btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); - btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); - - // stage 3~9 - iadst16_stage3_ssse3(x); - iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); - iadst16_stage5_ssse3(x); - iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); - iadst16_stage7_ssse3(x); - iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); - iadst16_stage9_ssse3(output, x); -} - -void iadst16_w4_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int32_t *cospi = cospi_arr(INV_COS_BIT); - const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); - - const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); - const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); - const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); - const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); - const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); - const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); - const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); - const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); - const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); - const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); - const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); - const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); - const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); - const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); - const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); - const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); - const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); - const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); - const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); - const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); - const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); - const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); - const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); - const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); - const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); - const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); - const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); - - // stage 1 - __m128i x[16]; - x[0] = input[15]; - x[1] = input[0]; - x[2] = input[13]; - x[3] = input[2]; - x[4] = input[11]; - x[5] = input[4]; - x[6] = input[9]; - x[7] = input[6]; - x[8] = input[7]; - x[9] = input[8]; - x[10] = input[5]; - x[11] = input[10]; - x[12] = input[3]; - x[13] = input[12]; - x[14] = input[1]; - x[15] = input[14]; - - // stage 2 - btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); - btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); - btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); - btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); - btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); - btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); - btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); - btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); - - // stage 3 - iadst16_stage3_ssse3(x); - - // stage 4 - btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); - btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); - btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); - btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); - - // stage 5 - iadst16_stage5_ssse3(x); - - // stage 6 - btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); - btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); - btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); - btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); - - // stage 7 - iadst16_stage7_ssse3(x); - - // stage 8 - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); - btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); - - // stage 9 - iadst16_stage9_ssse3(output, x); -} - -static void iidentity4_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); - const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); - for (int i = 0; i < 4; ++i) { - __m128i x = _mm_mulhrs_epi16(input[i], scale); - output[i] = _mm_adds_epi16(x, input[i]); - } -} - -static void iidentity8_new_sse2(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - for (int i = 0; i < 8; ++i) { - output[i] = _mm_adds_epi16(input[i], input[i]); - } -} - -static void iidentity16_new_ssse3(const __m128i *input, __m128i *output, - int8_t cos_bit) { - (void)cos_bit; - const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); - const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); - for (int i = 0; i < 16; ++i) { - __m128i x = _mm_mulhrs_epi16(input[i], scale); - __m128i srcx2 = _mm_adds_epi16(input[i], input[i]); - output[i] = _mm_adds_epi16(x, srcx2); - } -} - -static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, - __m128i res) { - const __m128i zero = _mm_setzero_si128(); - __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero)); - return _mm_packus_epi16(x0, x0); -} - -static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, - int stride, int flipud, - const int height) { - int j = flipud ? (height - 1) : 0; - const int step = flipud ? -1 : 1; - const __m128i zero = _mm_setzero_si128(); - for (int i = 0; i < height; ++i, j += step) { - const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride))); - __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero)); - u = _mm_packus_epi16(u, zero); - *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u); - } -} - -static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, - int stride, int flipud, - const int height) { - int j = flipud ? (height - 1) : 0; - const int step = flipud ? -1 : 1; - for (int i = 0; i < height; ++i, j += step) { - const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); - const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]); - _mm_storel_epi64((__m128i *)(output + i * stride), u); - } -} - -// 1D functions process process 8 pixels at one time. -static const transform_1d_ssse3 - lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { - { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 }, - { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 }, - { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 }, - { idct32_new_sse2, NULL, NULL }, - { idct64_low32_new_ssse3, NULL, NULL }, - }; - -// functions for blocks with eob at DC and within -// topleft 8x8, 16x16, 32x32 corner -static const transform_1d_ssse3 - lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { - { - { idct4_new_sse2, idct4_new_sse2, NULL, NULL }, - { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL }, - { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL }, - }, - { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL }, - { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL }, - { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } }, - { - { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2, - NULL }, - { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2, - NULL }, - { NULL, NULL, NULL, NULL }, - }, - { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3, - idct32_new_sse2 }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, - { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3, - idct64_low32_new_ssse3 }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } } - }; - -// 1D functions process process 4 pixels at one time. -// used in 4x4, 4x8, 4x16, 8x4, 16x4 -static const transform_1d_ssse3 - lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { - { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 }, - { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 }, - { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 }, - { NULL, NULL, NULL }, - { NULL, NULL, NULL }, - }; - -static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, - int stride, int shift, int height, - int txw_idx, int rect_type) { - const int32_t *input_row = input; - const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]); - const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) + - (1 << (NewSqrt2Bits - shift - 1))); - const __m128i one = _mm_set1_epi16(1); - const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); - if (rect_type != 1 && rect_type != -1) { - for (int i = 0; i < height; ++i) { - const __m128i src = load_32bit_to_16bit(input_row); - input_row += stride; - __m128i lo = _mm_unpacklo_epi16(src, one); - __m128i hi = _mm_unpackhi_epi16(src, one); - lo = _mm_madd_epi16(lo, scale_rounding); - hi = _mm_madd_epi16(hi, scale_rounding); - lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); - hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); - out[i] = _mm_packs_epi32(lo, hi); - } - } else { - const __m128i rect_scale = - _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); - for (int i = 0; i < height; ++i) { - __m128i src = load_32bit_to_16bit(input_row); - src = _mm_mulhrs_epi16(src, rect_scale); - input_row += stride; - __m128i lo = _mm_unpacklo_epi16(src, one); - __m128i hi = _mm_unpackhi_epi16(src, one); - lo = _mm_madd_epi16(lo, scale_rounding); - hi = _mm_madd_epi16(hi, scale_rounding); - lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); - hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); - out[i] = _mm_packs_epi32(lo, hi); - } - } -} - -static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, - __m128i *buf, int shift, int height, - int txh_idx) { - const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); - const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); - const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1)); - const __m128i one = _mm_set1_epi16(1); - const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding); - const __m128i zero = _mm_setzero_si128(); - for (int h = 0; h < height; ++h) { - __m128i lo = _mm_unpacklo_epi16(buf[h], one); - __m128i hi = _mm_unpackhi_epi16(buf[h], one); - lo = _mm_madd_epi16(lo, scale_coeff); - hi = _mm_madd_epi16(hi, scale_coeff); - lo = _mm_srai_epi32(lo, NewSqrt2Bits); - hi = _mm_srai_epi32(hi, NewSqrt2Bits); - lo = _mm_add_epi32(lo, shift_rounding); - hi = _mm_add_epi32(hi, shift_rounding); - lo = _mm_srai_epi32(lo, -shift); - hi = _mm_srai_epi32(hi, -shift); - __m128i x = _mm_packs_epi32(lo, hi); - - const __m128i pred = _mm_loadl_epi64((__m128i const *)(output)); - x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero)); - const __m128i u = _mm_packus_epi16(x, x); - _mm_storel_epi64((__m128i *)(output), u); - output += stride; - } -} - -static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, - uint8_t *output, int stride, - TX_SIZE tx_size) { - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int input_stride = AOMMIN(32, txfm_size_col); - const int row_max = AOMMIN(32, txfm_size_row); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - __m128i buf[32]; - - for (int i = 0; i < (input_stride >> 3); ++i) { - iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max, - txw_idx, rect_type); - iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max, - txh_idx); - } -} - -void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { - (void)tx_size_; - (void)eob; - __m128i buf[4]; - const TX_SIZE tx_size = TX_4X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - - const transform_1d_ssse3 row_txfm = - lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; - const transform_1d_ssse3 col_txfm = - lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); - transpose_16bit_4x4(buf, buf); - row_txfm(buf, buf, cos_bit_row); - if (lr_flip) { - __m128i temp[4]; - flip_buf_sse2(buf, temp, txfm_size_col); - transpose_16bit_4x4(temp, buf); - } else { - transpose_16bit_4x4(buf, buf); - } - col_txfm(buf, buf, cos_bit_col); - round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); - lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); -} - -static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, - __m128i res0, __m128i res1) { - const __m128i zero = _mm_setzero_si128(); - __m128i x0 = _mm_unpacklo_epi8(pred, zero); - __m128i x1 = _mm_unpackhi_epi8(pred, zero); - x0 = _mm_adds_epi16(res0, x0); - x1 = _mm_adds_epi16(res1, x1); - return _mm_packus_epi16(x0, x1); -} - -static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, - int stride, int flipud, - int height) { - int j = flipud ? (height - 1) : 0; - const int step = flipud ? -1 : 1; - for (int i = 0; i < height; ++i, j += step) { - __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); - __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]); - _mm_storeu_si128((__m128i *)(output + i * stride), u); - } -} - -static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, - int size) { - const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); - for (int i = 0; i < size; ++i) { - output[i] = _mm_mulhrs_epi16(input[i], scale); - } -} - -static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - __m128i buf1[64 * 8]; - int eobx, eoby; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div8 = txfm_size_col >> 3; - const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - const int input_stride = AOMMIN(32, txfm_size_col); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_ssse3 row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_ssse3 col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - for (int i = 0; i < buf_size_nonzero_h_div8; i++) { - __m128i buf0[64]; - const int32_t *input_row = input + i * input_stride * 8; - for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { - __m128i *buf0_cur = buf0 + j * 8; - load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); - transpose_16bit_8x8(buf0_cur, buf0_cur); - } - if (rect_type == 1 || rect_type == -1) { - round_shift_ssse3(buf0, buf0, input_stride); // rect special code - } - row_txfm(buf0, buf0, cos_bit_row); - round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); - __m128i *_buf1 = buf1 + i * 8; - if (lr_flip) { - for (int j = 0; j < buf_size_w_div8; ++j) { - __m128i temp[8]; - flip_buf_sse2(buf0 + 8 * j, temp, 8); - transpose_16bit_8x8(temp, - _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j)); - } - } else { - for (int j = 0; j < buf_size_w_div8; ++j) { - transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j); - } - } - } - for (int i = 0; i < buf_size_w_div8; i++) { - col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col); - round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]); - } - - if (txfm_size_col >= 16) { - for (int i = 0; i < (txfm_size_col >> 4); i++) { - lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2, - output + 16 * i, stride, ud_flip, - txfm_size_row); - } - } else if (txfm_size_col == 8) { - lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row); - } -} - -static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - int eobx, eoby; - get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div8 = (eobx + 8) >> 3; - const int input_stride = AOMMIN(32, txfm_size_col); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - - const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; - assert(fun_idx < 5); - const transform_1d_ssse3 col_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; - - assert(col_txfm != NULL); - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - for (int i = 0; i < buf_size_w_div8; i++) { - __m128i buf0[64]; - iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0], - eoby + 1, txw_idx, rect_type); - col_txfm(buf0, buf0, cos_bit_col); - __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1])); - int k = ud_flip ? (txfm_size_row - 1) : 0; - const int step = ud_flip ? -1 : 1; - uint8_t *out = output + 8 * i; - for (int j = 0; j < txfm_size_row; ++j, k += step) { - const __m128i v = _mm_loadl_epi64((__m128i const *)(out)); - __m128i res = _mm_mulhrs_epi16(buf0[k], mshift); - const __m128i u = lowbd_get_recon_8x8_sse2(v, res); - _mm_storel_epi64((__m128i *)(out), u); - out += stride; - } - } -} - -static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - __m128i buf1[64]; - int eobx, eoby; - get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div8 = txfm_size_col >> 3; - const int buf_size_h_div8 = (eoby + 8) >> 3; - const int input_stride = AOMMIN(32, txfm_size_col); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - - const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; - const transform_1d_ssse3 row_txfm = - lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; - - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - for (int i = 0; i < buf_size_h_div8; i++) { - __m128i buf0[64]; - const int32_t *input_row = input + i * input_stride * 8; - for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) { - __m128i *buf0_cur = buf0 + j * 8; - load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); - transpose_16bit_8x8(buf0_cur, buf0_cur); - } - if (rect_type == 1 || rect_type == -1) { - round_shift_ssse3(buf0, buf0, input_stride); // rect special code - } - row_txfm(buf0, buf0, cos_bit_row); - round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); - __m128i *_buf1 = buf1; - if (lr_flip) { - for (int j = 0; j < buf_size_w_div8; ++j) { - __m128i temp[8]; - flip_buf_sse2(buf0 + 8 * j, temp, 8); - transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j)); - } - } else { - for (int j = 0; j < buf_size_w_div8; ++j) { - transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j); - } - } - - for (int j = 0; j < buf_size_w_div8; ++j) { - iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride, - buf1 + j * 8, shift[1], 8, txh_idx); - } - } -} - -// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 -static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( - const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - switch (tx_type) { - case DCT_DCT: - lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, - tx_size, eob); - break; - case IDTX: - lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); - break; - case V_DCT: - case V_ADST: - case V_FLIPADST: - lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, - tx_size, eob); - break; - case H_DCT: - case H_ADST: - case H_FLIPADST: - lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, - tx_size, eob); - break; - default: - lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, - tx_size, eob); - break; - } -} - -void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { - (void)tx_size_; - (void)eob; - __m128i buf[8]; - const TX_SIZE tx_size = TX_4X8; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - - const transform_1d_ssse3 row_txfm = - lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; - const transform_1d_ssse3 col_txfm = - lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); - transpose_16bit_4x8(buf, buf); - round_shift_ssse3(buf, buf, txfm_size_col); // rect special code - row_txfm(buf, buf, cos_bit_row); - // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0 - if (lr_flip) { - __m128i temp[4]; - flip_buf_sse2(buf, temp, txfm_size_col); - transpose_16bit_8x4(temp, buf); - } else { - transpose_16bit_8x4(buf, buf); - } - col_txfm(buf, buf, cos_bit_col); - round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); - lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); -} - -void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { - (void)tx_size_; - (void)eob; - __m128i buf[8]; - const TX_SIZE tx_size = TX_8X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - - const transform_1d_ssse3 row_txfm = - lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; - const transform_1d_ssse3 col_txfm = - lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row); - transpose_16bit_8x4(buf, buf); - round_shift_ssse3(buf, buf, txfm_size_col); // rect special code - row_txfm(buf, buf, cos_bit_row); - // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0 - if (lr_flip) { - __m128i temp[8]; - flip_buf_sse2(buf, temp, txfm_size_col); - transpose_16bit_4x8(temp, buf); - } else { - transpose_16bit_4x8(buf, buf); - } - col_txfm(buf, buf, cos_bit_col); - round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); - lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); -} - -void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { - (void)tx_size_; - (void)eob; - __m128i buf[16]; - const TX_SIZE tx_size = TX_4X16; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - - const transform_1d_ssse3 row_txfm = - lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; - const transform_1d_ssse3 col_txfm = - lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - const int row_one_loop = 8; - for (int i = 0; i < 2; ++i) { - const int32_t *input_cur = input + i * txfm_size_col * row_one_loop; - __m128i *buf_cur = buf + i * row_one_loop; - load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, - row_one_loop); - transpose_16bit_4x8(buf_cur, buf_cur); - row_txfm(buf_cur, buf_cur, cos_bit_row); - round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); - if (lr_flip) { - __m128i temp[8]; - flip_buf_sse2(buf_cur, temp, txfm_size_col); - transpose_16bit_8x4(temp, buf_cur); - } else { - transpose_16bit_8x4(buf_cur, buf_cur); - } - } - col_txfm(buf, buf, cos_bit_col); - round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); - lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); -} - -void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size_, int eob) { - (void)tx_size_; - (void)eob; - __m128i buf[16]; - const TX_SIZE tx_size = TX_16X4; - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx]; - const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx]; - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div8 = txfm_size_col >> 3; - - const transform_1d_ssse3 row_txfm = - lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; - const transform_1d_ssse3 col_txfm = - lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; - - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - const int row_one_loop = 8; - for (int i = 0; i < buf_size_w_div8; ++i) { - const int32_t *input_cur = input + i * row_one_loop; - __m128i *buf_cur = buf + i * row_one_loop; - load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur, - txfm_size_row); - transpose_16bit_8x4(buf_cur, buf_cur); - } - row_txfm(buf, buf, cos_bit_row); - round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); - if (lr_flip) { - __m128i temp[16]; - flip_buf_sse2(buf, temp, 16); - transpose_16bit_4x8(temp, buf); - transpose_16bit_4x8(temp + 8, buf + 8); - } else { - transpose_16bit_4x8(buf, buf); - transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); - } - for (int i = 0; i < buf_size_w_div8; i++) { - col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col); - round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]); - } - lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); - lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); -} - -void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob) { - switch (tx_size) { - case TX_4X4: - lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size, - eob); - break; - case TX_4X8: - lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size, - eob); - break; - case TX_8X4: - lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size, - eob); - break; - case TX_4X16: - lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size, - eob); - break; - case TX_16X4: - lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size, - eob); - break; - default: - lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, - tx_size, eob); - break; - } -} -void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, - const TxfmParam *txfm_param) { - const TX_TYPE tx_type = txfm_param->tx_type; - if (!txfm_param->lossless) { - av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, - txfm_param->tx_size, txfm_param->eob); - } else { - av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); - } -} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h deleted file mode 100644 index 66bd339d1..000000000 --- a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ -#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ - -#include <emmintrin.h> // SSE2 -#include <tmmintrin.h> // SSSE3 - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/x86/transpose_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define btf_16_ssse3(w0, w1, in, out0, out1) \ - do { \ - const __m128i _w0 = _mm_set1_epi16(w0 * 8); \ - const __m128i _w1 = _mm_set1_epi16(w1 * 8); \ - const __m128i _in = in; \ - out0 = _mm_mulhrs_epi16(_in, _w0); \ - out1 = _mm_mulhrs_epi16(_in, _w1); \ - } while (0) - -#define btf_16_adds_subs_sse2(in0, in1) \ - do { \ - const __m128i _in0 = in0; \ - const __m128i _in1 = in1; \ - in0 = _mm_adds_epi16(_in0, _in1); \ - in1 = _mm_subs_epi16(_in0, _in1); \ - } while (0) - -#define btf_16_subs_adds_sse2(in0, in1) \ - do { \ - const __m128i _in0 = in0; \ - const __m128i _in1 = in1; \ - in1 = _mm_subs_epi16(_in0, _in1); \ - in0 = _mm_adds_epi16(_in0, _in1); \ - } while (0) - -#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \ - do { \ - const __m128i _in0 = in0; \ - const __m128i _in1 = in1; \ - out0 = _mm_adds_epi16(_in0, _in1); \ - out1 = _mm_subs_epi16(_in0, _in1); \ - } while (0) - -static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { - if (bit < 0) { - const __m128i scale = _mm_set1_epi16(1 << (15 + bit)); - for (int i = 0; i < size; ++i) { - in[i] = _mm_mulhrs_epi16(in[i], scale); - } - } else if (bit > 0) { - for (int i = 0; i < size; ++i) { - in[i] = _mm_slli_epi16(in[i], bit); - } - } -} - -// 1D itx types -typedef enum ATTRIBUTE_PACKED { - IDCT_1D, - IADST_1D, - IFLIPADST_1D = IADST_1D, - IIDENTITY_1D, - ITX_TYPES_1D, -} ITX_TYPE_1D; - -static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { - IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, - IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, - IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, - IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, -}; - -static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { - IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, - IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, - IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, - IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { - 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, -}; - -DECLARE_ALIGNED(16, static const int16_t, - av1_eob_to_eobxy_16x16_default[16]) = { - 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, - 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, -}; - -DECLARE_ALIGNED(16, static const int16_t, - av1_eob_to_eobxy_32x32_default[32]) = { - 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, - 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, - 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, - 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { - 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, - 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { - 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, -}; - -DECLARE_ALIGNED(16, static const int16_t, - av1_eob_to_eobxy_16x32_default[32]) = { - 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, - 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, - 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, - 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, -}; - -DECLARE_ALIGNED(16, static const int16_t, - av1_eob_to_eobxy_32x16_default[16]) = { - 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, - 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { - 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, - 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, - 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, - 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, -}; - -DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { - 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, -}; - -DECLARE_ALIGNED(16, static const int16_t *, - av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { - NULL, - av1_eob_to_eobxy_8x8_default, - av1_eob_to_eobxy_16x16_default, - av1_eob_to_eobxy_32x32_default, - av1_eob_to_eobxy_32x32_default, - NULL, - NULL, - av1_eob_to_eobxy_8x16_default, - av1_eob_to_eobxy_16x8_default, - av1_eob_to_eobxy_16x32_default, - av1_eob_to_eobxy_32x16_default, - av1_eob_to_eobxy_32x32_default, - av1_eob_to_eobxy_32x32_default, - NULL, - NULL, - av1_eob_to_eobxy_8x32_default, - av1_eob_to_eobxy_32x8_default, - av1_eob_to_eobxy_16x32_default, - av1_eob_to_eobxy_32x16_default, -}; - -static const int lowbd_txfm_all_1d_zeros_idx[32] = { - 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -}; - -// Transform block width in log2 for eob (size of 64 map to 32) -static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { - 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, -}; - -static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, - TX_SIZE tx_size, int eob) { - if (eob == 1) { - *eobx = 0; - *eoby = 0; - return; - } - - const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; - const int eob_row = (eob - 1) >> tx_w_log2; - const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; - *eobx = eobxy & 0xFF; - *eoby = eobxy >> 8; -} - -static int eob_fill[32] = { - 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, - 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, -}; - -static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, - TX_SIZE tx_size, int eob) { - eob -= 1; - const int txfm_size_col = tx_size_wide[tx_size]; - const int eobx_max = AOMMIN(32, txfm_size_col) - 1; - *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; - const int temp_eoby = eob / (eobx_max + 1); - assert(temp_eoby < 32); - *eoby = eob_fill[temp_eoby]; -} - -static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, - TX_SIZE tx_size, int eob) { - eob -= 1; - const int txfm_size_row = tx_size_high[tx_size]; - const int eoby_max = AOMMIN(32, txfm_size_row) - 1; - *eobx = eob / (eoby_max + 1); - *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob]; -} - -typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output, - int8_t cos_bit); - -void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob); -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h deleted file mode 100644 index 77aeb6eb1..000000000 --- a/third_party/aom/av1/common/x86/av1_txfm_sse2.h +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ -#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ - -#include <emmintrin.h> // SSE2 - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/x86/transpose_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" -#include "av1/common/av1_txfm.h" - -#ifdef __cplusplus -extern "C" { -#endif - -static INLINE void btf_16_w4_sse2( - const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, - const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, - __m128i *const out0, __m128i *const out1) { - const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1); - const __m128i u0 = _mm_madd_epi16(t0, *w0); - const __m128i v0 = _mm_madd_epi16(t0, *w1); - const __m128i a0 = _mm_add_epi32(u0, __rounding); - const __m128i b0 = _mm_add_epi32(v0, __rounding); - const __m128i c0 = _mm_srai_epi32(a0, cos_bit); - const __m128i d0 = _mm_srai_epi32(b0, cos_bit); - - *out0 = _mm_packs_epi32(c0, c0); - *out1 = _mm_packs_epi32(d0, c0); -} - -#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ - { \ - __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ - __m128i u0 = _mm_madd_epi16(t0, w0); \ - __m128i v0 = _mm_madd_epi16(t0, w1); \ - \ - __m128i a0 = _mm_add_epi32(u0, __rounding); \ - __m128i b0 = _mm_add_epi32(v0, __rounding); \ - \ - __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ - __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ - \ - out0 = _mm_packs_epi32(c0, c0); \ - out1 = _mm_packs_epi32(d0, d0); \ - } - -#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ - { \ - __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ - __m128i t1 = _mm_unpackhi_epi16(in0, in1); \ - __m128i u0 = _mm_madd_epi16(t0, w0); \ - __m128i u1 = _mm_madd_epi16(t1, w0); \ - __m128i v0 = _mm_madd_epi16(t0, w1); \ - __m128i v1 = _mm_madd_epi16(t1, w1); \ - \ - __m128i a0 = _mm_add_epi32(u0, __rounding); \ - __m128i a1 = _mm_add_epi32(u1, __rounding); \ - __m128i b0 = _mm_add_epi32(v0, __rounding); \ - __m128i b1 = _mm_add_epi32(v1, __rounding); \ - \ - __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ - __m128i c1 = _mm_srai_epi32(a1, cos_bit); \ - __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ - __m128i d1 = _mm_srai_epi32(b1, cos_bit); \ - \ - out0 = _mm_packs_epi32(c0, c1); \ - out1 = _mm_packs_epi32(d0, d1); \ - } - -static INLINE __m128i load_16bit_to_16bit(const int16_t *a) { - return _mm_load_si128((const __m128i *)a); -} - -static INLINE __m128i load_32bit_to_16bit(const int32_t *a) { - const __m128i a_low = _mm_load_si128((const __m128i *)a); - return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); -} - -static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { - const __m128i a_low = _mm_load_si128((const __m128i *)a); - return _mm_packs_epi32(a_low, a_low); -} - -// Store 4 16 bit values. Sign extend the values. -static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { - const __m128i a_lo = _mm_unpacklo_epi16(a, a); - const __m128i a_1 = _mm_srai_epi32(a_lo, 16); - _mm_store_si128((__m128i *)b, a_1); -} - -// Store 8 16 bit values. Sign extend the values. -static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { - const __m128i a_lo = _mm_unpacklo_epi16(a, a); - const __m128i a_hi = _mm_unpackhi_epi16(a, a); - const __m128i a_1 = _mm_srai_epi32(a_lo, 16); - const __m128i a_2 = _mm_srai_epi32(a_hi, 16); - _mm_store_si128((__m128i *)b, a_1); - _mm_store_si128((__m128i *)(b + 4), a_2); -} - -static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) { - const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); - const __m128i b = _mm_madd_epi16(a, scale_rounding); - return _mm_srai_epi32(b, NewSqrt2Bits); -} - -static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, - int32_t *const b) { - const __m128i one = _mm_set1_epi16(1); - const __m128i a_lo = _mm_unpacklo_epi16(a, one); - const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); - _mm_store_si128((__m128i *)b, b_lo); -} - -static INLINE void store_rect_16bit_to_32bit(const __m128i a, - int32_t *const b) { - const __m128i one = _mm_set1_epi16(1); - const __m128i a_lo = _mm_unpacklo_epi16(a, one); - const __m128i a_hi = _mm_unpackhi_epi16(a, one); - const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); - const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); - _mm_store_si128((__m128i *)b, b_lo); - _mm_store_si128((__m128i *)(b + 4), b_hi); -} - -static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, - const int stride, - __m128i *const out, - const int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); - } -} - -static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, - const int stride, - __m128i *const out, - const int out_size) { - for (int i = 0; i < out_size; ++i) { - out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); - } -} - -static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride, - __m128i *out, int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = load_16bit_to_16bit(in + i * stride); - } -} - -static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, - int stride, __m128i *out, - int out_size) { - for (int i = 0; i < out_size; ++i) { - out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); - } -} - -static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, - __m128i *out, int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = load_32bit_to_16bit(in + i * stride); - } -} - -static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, - __m128i *out, int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = load_32bit_to_16bit_w4(in + i * stride); - } -} - -static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, - int stride, __m128i *out, - int out_size) { - for (int i = 0; i < out_size; ++i) { - out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride); - } -} - -static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, - int32_t *const out, - const int stride, - const int out_size) { - for (int i = 0; i < out_size; ++i) { - store_16bit_to_32bit_w4(in[i], out + i * stride); - } -} - -static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, - int32_t *const out, - const int stride, - const int out_size) { - for (int i = 0; i < out_size; ++i) { - store_16bit_to_32bit(in[i], out + i * stride); - } -} - -static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, - int32_t *const out, - const int stride, - const int out_size) { - for (int i = 0; i < out_size; ++i) { - store_rect_16bit_to_32bit_w4(in[i], out + i * stride); - } -} - -static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, - int32_t *const out, - const int stride, - const int out_size) { - for (int i = 0; i < out_size; ++i) { - store_rect_16bit_to_32bit(in[i], out + i * stride); - } -} - -static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, - uint16_t *out, - const int stride) { - for (int i = 0; i < 8; ++i) { - _mm_store_si128((__m128i *)(out + i * stride), in[i]); - } -} - -static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { - if (bit < 0) { - bit = -bit; - __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); - for (int i = 0; i < size; ++i) { - in[i] = _mm_adds_epi16(in[i], rounding); - in[i] = _mm_srai_epi16(in[i], bit); - } - } else if (bit > 0) { - for (int i = 0; i < size; ++i) { - in[i] = _mm_slli_epi16(in[i], bit); - } - } -} - -static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { - for (int i = 0; i < size; ++i) { - out[size - i - 1] = in[i]; - } -} - -void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, - int stride, TX_TYPE tx_type, int bd); - -typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, - int8_t cos_bit); - -typedef struct { - transform_1d_sse2 col, row; // vertical and horizontal -} transform_2d_sse2; - -#ifdef __cplusplus -} -#endif // __cplusplus -#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c deleted file mode 100644 index 90b9879cc..000000000 --- a/third_party/aom/av1/common/x86/av1_txfm_sse4.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "config/aom_dsp_rtcd.h" - -#include "av1/common/av1_txfm.h" -#include "av1/common/x86/av1_txfm_sse4.h" - -void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) { - __m128i *const vec = (__m128i *)arr; - const int vec_size = size >> 2; - av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit); -} diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h deleted file mode 100644 index 6cad821b1..000000000 --- a/third_party/aom/av1/common/x86/av1_txfm_sse4.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ -#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ - -#include <smmintrin.h> - -#ifdef __cplusplus -extern "C" { -#endif - -static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { - __m128i tmp, round; - round = _mm_set1_epi32(1 << (bit - 1)); - tmp = _mm_add_epi32(vec, round); - return _mm_srai_epi32(tmp, bit); -} - -static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input, - __m128i *output, - const int size, - const int bit) { - if (bit > 0) { - int i; - for (i = 0; i < size; i++) { - output[i] = av1_round_shift_32_sse4_1(input[i], bit); - } - } else { - int i; - for (i = 0; i < size; i++) { - output[i] = _mm_slli_epi32(input[i], -bit); - } - } -} - -static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, - __m128i *output, - const int size, - const int bit, - const int val) { - const __m128i sqrt2 = _mm_set1_epi32(val); - if (bit > 0) { - int i; - for (i = 0; i < size; i++) { - const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit); - const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); - output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); - } - } else { - int i; - for (i = 0; i < size; i++) { - const __m128i r0 = _mm_slli_epi32(input[i], -bit); - const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); - output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); - } - } -} - -#ifdef __cplusplus -} -#endif - -#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c deleted file mode 100644 index a8bfdcce6..000000000 --- a/third_party/aom/av1/common/x86/cfl_avx2.c +++ /dev/null @@ -1,491 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <immintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/cfl.h" - -#include "av1/common/x86/cfl_simd.h" - -#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ - cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ - TX_SIZE tx_size) { \ - static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ - subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ - subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ - subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ - subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ - cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ - subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ - subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ - subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ - subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ - subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ - cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ - subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ - subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ - subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ - cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \ - cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \ - }; \ - return subfn_##sub[tx_size]; \ - } - -/** - * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more - * precise version of a box filter 4:2:0 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - * - * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. - */ -static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, - int input_stride, - uint16_t *pred_buf_q3, int width, - int height) { - (void)width; // Forever 32 - const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos - const int luma_stride = input_stride << 1; - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; - do { - __m256i top = _mm256_loadu_si256((__m256i *)input); - __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); - - __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); - __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos); - __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); - - _mm256_storeu_si256(row, sum_16x16); - - input += luma_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) - -/** - * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more - * precise version of a box filter 4:2:2 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - */ -static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input, - int input_stride, - uint16_t *pred_buf_q3, int width, - int height) { - (void)width; // Forever 32 - const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + height * CFL_BUF_LINE_I256; - do { - __m256i top = _mm256_loadu_si256((__m256i *)input); - __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); - _mm256_storeu_si256(row, top_16x16); - input += input_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) - -/** - * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only - * performed on block of width 32. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - */ -static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, - int input_stride, - uint16_t *pred_buf_q3, int width, - int height) { - (void)width; // Forever 32 - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + height * CFL_BUF_LINE_I256; - const __m256i zeros = _mm256_setzero_si256(); - do { - __m256i top = _mm256_loadu_si256((__m256i *)input); - top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); - - __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); - row_lo = _mm256_slli_epi16(row_lo, 3); - __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); - row_hi = _mm256_slli_epi16(row_hi, 3); - - _mm256_storeu_si256(row, row_lo); - _mm256_storeu_si256(row + 1, row_hi); - - input += input_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) - -/** - * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more - * precise version of a box filter 4:2:0 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - * - * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. - */ -static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input, - int input_stride, - uint16_t *pred_buf_q3, int width, - int height) { - (void)width; // Forever 32 - const int luma_stride = input_stride << 1; - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; - do { - __m256i top = _mm256_loadu_si256((__m256i *)input); - __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); - __m256i sum = _mm256_add_epi16(top, bot); - - __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); - __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)); - __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); - - __m256i hsum = _mm256_hadd_epi16(sum, sum_1); - hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); - hsum = _mm256_add_epi16(hsum, hsum); - - _mm256_storeu_si256(row, hsum); - - input += luma_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) - -/** - * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more - * precise version of a box filter 4:2:2 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - * - */ -static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, - int input_stride, - uint16_t *pred_buf_q3, int width, - int height) { - (void)width; // Forever 32 - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + height * CFL_BUF_LINE_I256; - do { - __m256i top = _mm256_loadu_si256((__m256i *)input); - __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); - __m256i hsum = _mm256_hadd_epi16(top, top_1); - hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); - hsum = _mm256_slli_epi16(hsum, 2); - - _mm256_storeu_si256(row, hsum); - - input += input_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) - -static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, - int input_stride, - uint16_t *pred_buf_q3, int width, - int height) { - (void)width; // Forever 32 - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + height * CFL_BUF_LINE_I256; - do { - __m256i top = _mm256_loadu_si256((__m256i *)input); - __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); - _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); - _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); - input += input_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) - -static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, - __m256i alpha_sign, __m256i dc_q0) { - __m256i ac_q3 = _mm256_loadu_si256(input); - __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); - __m256i scaled_luma_q0 = - _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12); - scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign); - return _mm256_add_epi16(scaled_luma_q0, dc_q0); -} - -static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, - uint8_t *dst, int dst_stride, - int alpha_q3, int width, int height) { - (void)width; - const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); - const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); - const __m256i dc_q0 = _mm256_set1_epi16(*dst); - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + height * CFL_BUF_LINE_I256; - - do { - __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); - __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); - res = _mm256_packus_epi16(res, next); - res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_storeu_si256((__m256i *)dst, res); - dst += dst_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_PREDICT_X(avx2, 32, 8, lbd); -CFL_PREDICT_X(avx2, 32, 16, lbd); -CFL_PREDICT_X(avx2, 32, 32, lbd); - -cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) { - static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { - predict_lbd_4x4_ssse3, /* 4x4 */ - predict_lbd_8x8_ssse3, /* 8x8 */ - predict_lbd_16x16_ssse3, /* 16x16 */ - predict_lbd_32x32_avx2, /* 32x32 */ - cfl_predict_lbd_null, /* 64x64 (invalid CFL size) */ - predict_lbd_4x8_ssse3, /* 4x8 */ - predict_lbd_8x4_ssse3, /* 8x4 */ - predict_lbd_8x16_ssse3, /* 8x16 */ - predict_lbd_16x8_ssse3, /* 16x8 */ - predict_lbd_16x32_ssse3, /* 16x32 */ - predict_lbd_32x16_avx2, /* 32x16 */ - cfl_predict_lbd_null, /* 32x64 (invalid CFL size) */ - cfl_predict_lbd_null, /* 64x32 (invalid CFL size) */ - predict_lbd_4x16_ssse3, /* 4x16 */ - predict_lbd_16x4_ssse3, /* 16x4 */ - predict_lbd_8x32_ssse3, /* 8x32 */ - predict_lbd_32x8_avx2, /* 32x8 */ - cfl_predict_lbd_null, /* 16x64 (invalid CFL size) */ - cfl_predict_lbd_null, /* 64x16 (invalid CFL size) */ - }; - // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the - // function pointer array out of bounds. - return pred[tx_size % TX_SIZES_ALL]; -} - -static __m256i highbd_max_epi16(int bd) { - const __m256i neg_one = _mm256_set1_epi16(-1); - // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) - return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one); -} - -static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { - return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); -} - -static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, - uint16_t *dst, int dst_stride, - int alpha_q3, int bd, int width, - int height) { - // Use SSSE3 version for smaller widths - assert(width == 16 || width == 32); - const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); - const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); - const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); - const __m256i max = highbd_max_epi16(bd); - - __m256i *row = (__m256i *)pred_buf_q3; - const __m256i *row_end = row + height * CFL_BUF_LINE_I256; - do { - const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); - _mm256_storeu_si256((__m256i *)dst, - highbd_clamp_epi16(res, _mm256_setzero_si256(), max)); - if (width == 32) { - const __m256i res_1 = - predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); - _mm256_storeu_si256( - (__m256i *)(dst + 16), - highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max)); - } - dst += dst_stride; - } while ((row += CFL_BUF_LINE_I256) < row_end); -} - -CFL_PREDICT_X(avx2, 16, 4, hbd) -CFL_PREDICT_X(avx2, 16, 8, hbd) -CFL_PREDICT_X(avx2, 16, 16, hbd) -CFL_PREDICT_X(avx2, 16, 32, hbd) -CFL_PREDICT_X(avx2, 32, 8, hbd) -CFL_PREDICT_X(avx2, 32, 16, hbd) -CFL_PREDICT_X(avx2, 32, 32, hbd) - -cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) { - static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { - predict_hbd_4x4_ssse3, /* 4x4 */ - predict_hbd_8x8_ssse3, /* 8x8 */ - predict_hbd_16x16_avx2, /* 16x16 */ - predict_hbd_32x32_avx2, /* 32x32 */ - cfl_predict_hbd_null, /* 64x64 (invalid CFL size) */ - predict_hbd_4x8_ssse3, /* 4x8 */ - predict_hbd_8x4_ssse3, /* 8x4 */ - predict_hbd_8x16_ssse3, /* 8x16 */ - predict_hbd_16x8_avx2, /* 16x8 */ - predict_hbd_16x32_avx2, /* 16x32 */ - predict_hbd_32x16_avx2, /* 32x16 */ - cfl_predict_hbd_null, /* 32x64 (invalid CFL size) */ - cfl_predict_hbd_null, /* 64x32 (invalid CFL size) */ - predict_hbd_4x16_ssse3, /* 4x16 */ - predict_hbd_16x4_avx2, /* 16x4 */ - predict_hbd_8x32_ssse3, /* 8x32 */ - predict_hbd_32x8_avx2, /* 32x8 */ - cfl_predict_hbd_null, /* 16x64 (invalid CFL size) */ - cfl_predict_hbd_null, /* 64x16 (invalid CFL size) */ - }; - // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the - // function pointer array out of bounds. - return pred[tx_size % TX_SIZES_ALL]; -} - -// Returns a vector where all the (32-bits) elements are the sum of all the -// lanes in a. -static INLINE __m256i fill_sum_epi32(__m256i a) { - // Given that a == [A, B, C, D, E, F, G, H] - a = _mm256_hadd_epi32(a, a); - // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H - // a == [A', C', A', C', E', G', E', G'] - a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)); - // a == [A', C', E', G', A', C', E', G'] - a = _mm256_hadd_epi32(a, a); - // Given that A'' == A' + C' and E'' == E' + G' - // a == [A'', E'', A'', E'', A'', E'', A'', E''] - return _mm256_hadd_epi32(a, a); - // Given that A''' == A'' + E'' - // a == [A''', A''', A''', A''', A''', A''', A''', A'''] -} - -static INLINE __m256i _mm256_addl_epi16(__m256i a) { - return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), - _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); -} - -static INLINE void subtract_average_avx2(const uint16_t *src_ptr, - int16_t *dst_ptr, int width, - int height, int round_offset, - int num_pel_log2) { - // Use SSE2 version for smaller widths - assert(width == 16 || width == 32); - - const __m256i *src = (__m256i *)src_ptr; - const __m256i *const end = src + height * CFL_BUF_LINE_I256; - // To maximize usage of the AVX2 registers, we sum two rows per loop - // iteration - const int step = 2 * CFL_BUF_LINE_I256; - - __m256i sum = _mm256_setzero_si256(); - // For width 32, we use a second sum accumulator to reduce accumulator - // dependencies in the loop. - __m256i sum2; - if (width == 32) sum2 = _mm256_setzero_si256(); - - do { - // Add top row to the bottom row - __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src), - _mm256_loadu_si256(src + CFL_BUF_LINE_I256)); - sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); - if (width == 32) { /* Don't worry, this if it gets optimized out. */ - // Add the second part of the top row to the second part of the bottom row - __m256i l1 = - _mm256_add_epi16(_mm256_loadu_si256(src + 1), - _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256)); - sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1)); - } - src += step; - } while (src < end); - // Combine both sum accumulators - if (width == 32) sum = _mm256_add_epi32(sum, sum2); - - __m256i fill = fill_sum_epi32(sum); - - __m256i avg_epi16 = _mm256_srli_epi32( - _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2); - avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16); - - // Store and subtract loop - src = (__m256i *)src_ptr; - __m256i *dst = (__m256i *)dst_ptr; - do { - _mm256_storeu_si256(dst, - _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16)); - if (width == 32) { - _mm256_storeu_si256( - dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16)); - } - src += CFL_BUF_LINE_I256; - dst += CFL_BUF_LINE_I256; - } while (src < end); -} - -// Declare wrappers for AVX2 sizes -CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) -CFL_SUB_AVG_X(avx2, 16, 8, 64, 7) -CFL_SUB_AVG_X(avx2, 16, 16, 128, 8) -CFL_SUB_AVG_X(avx2, 16, 32, 256, 9) -CFL_SUB_AVG_X(avx2, 32, 8, 128, 8) -CFL_SUB_AVG_X(avx2, 32, 16, 256, 9) -CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) - -// Based on the observation that for small blocks AVX2 does not outperform -// SSE2, we call the SSE2 code for block widths 4 and 8. -cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) { - static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { - subtract_average_4x4_sse2, /* 4x4 */ - subtract_average_8x8_sse2, /* 8x8 */ - subtract_average_16x16_avx2, /* 16x16 */ - subtract_average_32x32_avx2, /* 32x32 */ - cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ - subtract_average_4x8_sse2, /* 4x8 */ - subtract_average_8x4_sse2, /* 8x4 */ - subtract_average_8x16_sse2, /* 8x16 */ - subtract_average_16x8_avx2, /* 16x8 */ - subtract_average_16x32_avx2, /* 16x32 */ - subtract_average_32x16_avx2, /* 32x16 */ - cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ - subtract_average_4x16_sse2, /* 4x16 */ - subtract_average_16x4_avx2, /* 16x4 */ - subtract_average_8x32_sse2, /* 8x32 */ - subtract_average_32x8_avx2, /* 32x8 */ - cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ - cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ - }; - // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to - // index the function pointer array out of bounds. - return sub_avg[tx_size % TX_SIZES_ALL]; -} diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h deleted file mode 100644 index 3b342cd4e..000000000 --- a/third_party/aom/av1/common/x86/cfl_simd.h +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ -#define AOM_AV1_COMMON_X86_CFL_SIMD_H_ - -#include "av1/common/blockd.h" - -// SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -// SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); - -// SSE2 version is optimal for with == 4, we reuse them in AVX2 -void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); - -// SSE2 version is optimal for with == 8, we reuse them in AVX2 -void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); - -void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); - -void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); - -#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c deleted file mode 100644 index 4783fe098..000000000 --- a/third_party/aom/av1/common/x86/cfl_sse2.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> - -#include "av1/common/cfl.h" -#include "config/av1_rtcd.h" - -static INLINE __m128i fill_sum_epi32(__m128i l0) { - l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2))); - return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1))); -} - -static INLINE void subtract_average_sse2(const uint16_t *src_ptr, - int16_t *dst_ptr, int width, - int height, int round_offset, - int num_pel_log2) { - const __m128i zeros = _mm_setzero_si128(); - const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset); - const __m128i *src = (__m128i *)src_ptr; - const __m128i *const end = src + height * CFL_BUF_LINE_I128; - const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4)); - - __m128i sum = zeros; - do { - __m128i l0; - if (width == 4) { - l0 = _mm_add_epi16(_mm_loadl_epi64(src), - _mm_loadl_epi64(src + CFL_BUF_LINE_I128)); - __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128), - _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128)); - sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), - _mm_unpacklo_epi16(l1, zeros))); - } else { - if (width == 8) { - l0 = _mm_add_epi16(_mm_loadu_si128(src), - _mm_loadu_si128(src + CFL_BUF_LINE_I128)); - } else { - l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1)); - } - sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), - _mm_unpackhi_epi16(l0, zeros))); - if (width == 32) { - l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3)); - sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), - _mm_unpackhi_epi16(l0, zeros))); - } - } - src += step; - } while (src < end); - - sum = fill_sum_epi32(sum); - - __m128i avg_epi16 = - _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2); - avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16); - - src = (__m128i *)src_ptr; - __m128i *dst = (__m128i *)dst_ptr; - do { - if (width == 4) { - _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16)); - } else { - _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16)); - if (width > 8) { - _mm_storeu_si128(dst + 1, - _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16)); - if (width == 32) { - _mm_storeu_si128(dst + 2, - _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16)); - _mm_storeu_si128(dst + 3, - _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16)); - } - } - } - src += CFL_BUF_LINE_I128; - dst += CFL_BUF_LINE_I128; - } while (src < end); -} - -CFL_SUB_AVG_FN(sse2) diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c deleted file mode 100644 index bbf007295..000000000 --- a/third_party/aom/av1/common/x86/cfl_ssse3.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/cfl.h" - -#include "av1/common/x86/cfl_simd.h" - -// Load 32-bit integer from memory into the first element of dst. -static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) { - return _mm_cvtsi32_si128(*((int *)mem_addr)); -} - -// Store 32-bit integer from the first element of a into memory. -static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { - *((int *)mem_addr) = _mm_cvtsi128_si32(a); -} - -/** - * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more - * precise version of a box filter 4:2:0 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - */ -static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, - int input_stride, - uint16_t *pred_buf_q3, - int width, int height) { - const __m128i twos = _mm_set1_epi8(2); - __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; - const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128; - const int luma_stride = input_stride << 1; - do { - if (width == 4) { - __m128i top = _mm_loadh_epi32((__m128i *)input); - top = _mm_maddubs_epi16(top, twos); - __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride)); - bot = _mm_maddubs_epi16(bot, twos); - const __m128i sum = _mm_add_epi16(top, bot); - _mm_storeh_epi32(pred_buf_m128i, sum); - } else if (width == 8) { - __m128i top = _mm_loadl_epi64((__m128i *)input); - top = _mm_maddubs_epi16(top, twos); - __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); - bot = _mm_maddubs_epi16(bot, twos); - const __m128i sum = _mm_add_epi16(top, bot); - _mm_storel_epi64(pred_buf_m128i, sum); - } else { - __m128i top = _mm_loadu_si128((__m128i *)input); - top = _mm_maddubs_epi16(top, twos); - __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); - bot = _mm_maddubs_epi16(bot, twos); - const __m128i sum = _mm_add_epi16(top, bot); - _mm_storeu_si128(pred_buf_m128i, sum); - if (width == 32) { - __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); - __m128i bot_1 = - _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); - top_1 = _mm_maddubs_epi16(top_1, twos); - bot_1 = _mm_maddubs_epi16(bot_1, twos); - __m128i sum_1 = _mm_add_epi16(top_1, bot_1); - _mm_storeu_si128(pred_buf_m128i + 1, sum_1); - } - } - input += luma_stride; - pred_buf_m128i += CFL_BUF_LINE_I128; - } while (pred_buf_m128i < end); -} - -/** - * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more - * precise version of a box filter 4:2:2 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - */ -static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, - int input_stride, - uint16_t *pred_buf_q3, - int width, int height) { - const __m128i fours = _mm_set1_epi8(4); - __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; - const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; - do { - if (width == 4) { - __m128i top = _mm_loadh_epi32((__m128i *)input); - top = _mm_maddubs_epi16(top, fours); - _mm_storeh_epi32(pred_buf_m128i, top); - } else if (width == 8) { - __m128i top = _mm_loadl_epi64((__m128i *)input); - top = _mm_maddubs_epi16(top, fours); - _mm_storel_epi64(pred_buf_m128i, top); - } else { - __m128i top = _mm_loadu_si128((__m128i *)input); - top = _mm_maddubs_epi16(top, fours); - _mm_storeu_si128(pred_buf_m128i, top); - if (width == 32) { - __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); - top_1 = _mm_maddubs_epi16(top_1, fours); - _mm_storeu_si128(pred_buf_m128i + 1, top_1); - } - } - input += input_stride; - pred_buf_m128i += CFL_BUF_LINE_I128; - } while (pred_buf_m128i < end); -} - -/** - * Multiplies the pixels by 8 (scaling in Q3). - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - */ -static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, - int input_stride, - uint16_t *pred_buf_q3, - int width, int height) { - const __m128i zeros = _mm_setzero_si128(); - const int luma_stride = input_stride; - __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; - const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; - do { - if (width == 4) { - __m128i row = _mm_loadh_epi32((__m128i *)input); - row = _mm_unpacklo_epi8(row, zeros); - _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3)); - } else if (width == 8) { - __m128i row = _mm_loadl_epi64((__m128i *)input); - row = _mm_unpacklo_epi8(row, zeros); - _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3)); - } else { - __m128i row = _mm_loadu_si128((__m128i *)input); - const __m128i row_lo = _mm_unpacklo_epi8(row, zeros); - const __m128i row_hi = _mm_unpackhi_epi8(row, zeros); - _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3)); - _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3)); - if (width == 32) { - __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); - const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros); - const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros); - _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3)); - _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3)); - } - } - input += luma_stride; - pred_buf_m128i += CFL_BUF_LINE_I128; - } while (pred_buf_m128i < end); -} - -/** - * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more - * precise version of a box filter 4:2:0 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - */ -static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, - int input_stride, - uint16_t *pred_buf_q3, - int width, int height) { - const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; - const int luma_stride = input_stride << 1; - do { - if (width == 4) { - const __m128i top = _mm_loadl_epi64((__m128i *)input); - const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); - __m128i sum = _mm_add_epi16(top, bot); - sum = _mm_hadd_epi16(sum, sum); - *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); - } else { - const __m128i top = _mm_loadu_si128((__m128i *)input); - const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); - __m128i sum = _mm_add_epi16(top, bot); - if (width == 8) { - sum = _mm_hadd_epi16(sum, sum); - _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); - } else { - const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); - const __m128i bot_1 = - _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); - sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); - _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); - if (width == 32) { - const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); - const __m128i bot_2 = - _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); - const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); - const __m128i bot_3 = - _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); - const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); - const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); - __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); - _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, - _mm_add_epi16(next_sum, next_sum)); - } - } - } - input += luma_stride; - } while ((pred_buf_q3 += CFL_BUF_LINE) < end); -} - -/** - * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more - * precise version of a box filter 4:2:2 pixel subsampling in Q3. - * - * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the - * active area is specified using width and height. - * - * Note: We don't need to worry about going over the active area, as long as we - * stay inside the CfL prediction buffer. - */ -static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, - int input_stride, - uint16_t *pred_buf_q3, - int width, int height) { - __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; - const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; - do { - if (width == 4) { - const __m128i top = _mm_loadl_epi64((__m128i *)input); - const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); - _mm_storeh_epi32(pred_buf_m128i, sum); - } else { - const __m128i top = _mm_loadu_si128((__m128i *)input); - if (width == 8) { - const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); - _mm_storel_epi64(pred_buf_m128i, sum); - } else { - const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); - const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); - _mm_storeu_si128(pred_buf_m128i, sum); - if (width == 32) { - const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); - const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); - const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); - _mm_storeu_si128(pred_buf_m128i + 1, sum_1); - } - } - } - pred_buf_m128i += CFL_BUF_LINE_I128; - input += input_stride; - } while (pred_buf_m128i < end); -} - -static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, - int input_stride, - uint16_t *pred_buf_q3, - int width, int height) { - const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; - do { - if (width == 4) { - const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); - _mm_storel_epi64((__m128i *)pred_buf_q3, row); - } else { - const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); - _mm_storeu_si128((__m128i *)pred_buf_q3, row); - if (width >= 16) { - __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); - row_1 = _mm_slli_epi16(row_1, 3); - _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); - if (width == 32) { - __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); - row_2 = _mm_slli_epi16(row_2, 3); - _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); - __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); - row_3 = _mm_slli_epi16(row_3, 3); - _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); - } - } - } - input += input_stride; - pred_buf_q3 += CFL_BUF_LINE; - } while (pred_buf_q3 < end); -} - -CFL_GET_SUBSAMPLE_FUNCTION(ssse3) - -static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, - __m128i alpha_sign, __m128i dc_q0) { - __m128i ac_q3 = _mm_loadu_si128(input); - __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); - __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); - scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); - return _mm_add_epi16(scaled_luma_q0, dc_q0); -} - -static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, - uint8_t *dst, int dst_stride, - int alpha_q3, int width, int height) { - const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); - const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); - const __m128i dc_q0 = _mm_set1_epi16(*dst); - __m128i *row = (__m128i *)pred_buf_q3; - const __m128i *row_end = row + height * CFL_BUF_LINE_I128; - do { - __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); - if (width < 16) { - res = _mm_packus_epi16(res, res); - if (width == 4) - _mm_storeh_epi32((__m128i *)dst, res); - else - _mm_storel_epi64((__m128i *)dst, res); - } else { - __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); - res = _mm_packus_epi16(res, next); - _mm_storeu_si128((__m128i *)dst, res); - if (width == 32) { - res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); - next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); - res = _mm_packus_epi16(res, next); - _mm_storeu_si128((__m128i *)(dst + 16), res); - } - } - dst += dst_stride; - } while ((row += CFL_BUF_LINE_I128) < row_end); -} - -CFL_PREDICT_FN(ssse3, lbd) - -static INLINE __m128i highbd_max_epi16(int bd) { - const __m128i neg_one = _mm_set1_epi16(-1); - // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) - return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); -} - -static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { - return _mm_max_epi16(_mm_min_epi16(u, max), zero); -} - -static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, - uint16_t *dst, int dst_stride, - int alpha_q3, int bd, int width, - int height) { - const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); - const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); - const __m128i dc_q0 = _mm_set1_epi16(*dst); - const __m128i max = highbd_max_epi16(bd); - const __m128i zeros = _mm_setzero_si128(); - __m128i *row = (__m128i *)pred_buf_q3; - const __m128i *row_end = row + height * CFL_BUF_LINE_I128; - do { - __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); - res = highbd_clamp_epi16(res, zeros, max); - if (width == 4) { - _mm_storel_epi64((__m128i *)dst, res); - } else { - _mm_storeu_si128((__m128i *)dst, res); - } - if (width >= 16) { - const __m128i res_1 = - predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); - _mm_storeu_si128(((__m128i *)dst) + 1, - highbd_clamp_epi16(res_1, zeros, max)); - } - if (width == 32) { - const __m128i res_2 = - predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); - _mm_storeu_si128((__m128i *)(dst + 16), - highbd_clamp_epi16(res_2, zeros, max)); - const __m128i res_3 = - predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); - _mm_storeu_si128((__m128i *)(dst + 24), - highbd_clamp_epi16(res_3, zeros, max)); - } - dst += dst_stride; - } while ((row += CFL_BUF_LINE_I128) < row_end); -} - -CFL_PREDICT_FN(ssse3, hbd) diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c deleted file mode 100644 index 0acafd044..000000000 --- a/third_party/aom/av1/common/x86/convolve_2d_avx2.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/av1_rtcd.h" - -#include "aom_dsp/x86/convolve_avx2.h" -#include "aom_dsp/x86/convolve_common_intrin.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/synonyms.h" -#include "av1/common/convolve.h" - -void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int bd = 8; - - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const int bits = - FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - - __m256i filt[4], coeffs_h[4], coeffs_v[4]; - - assert(conv_params->round_0 > 0); - - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v); - - const __m256i round_const_h = _mm256_set1_epi16( - ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); - const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); - - const __m256i sum_round_v = _mm256_set1_epi32( - (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); - const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); - - const __m256i round_const_v = _mm256_set1_epi32( - ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - - ((1 << (offset_bits - conv_params->round_1)) >> 1)); - const __m128i round_shift_v = _mm_cvtsi32_si128(bits); - - for (j = 0; j < w; j += 8) { - for (i = 0; i < im_h; i += 2) { - __m256i data = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); - - // Load the next line - if (i + 1 < im_h) - data = _mm256_inserti128_si256( - data, - _mm_loadu_si128( - (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), - 1); - - __m256i res = convolve_lowbd_x(data, coeffs_h, filt); - - res = - _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); - - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); - } - - /* Vertical filter */ - { - __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); - __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); - __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); - __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - - __m256i s[8]; - s[0] = _mm256_unpacklo_epi16(src_0, src_1); - s[1] = _mm256_unpacklo_epi16(src_2, src_3); - s[2] = _mm256_unpacklo_epi16(src_4, src_5); - - s[4] = _mm256_unpackhi_epi16(src_0, src_1); - s[5] = _mm256_unpackhi_epi16(src_2, src_3); - s[6] = _mm256_unpackhi_epi16(src_4, src_5); - - for (i = 0; i < h; i += 2) { - const int16_t *data = &im_block[i * im_stride]; - - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); - - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); - - __m256i res_a = convolve(s, coeffs_v); - __m256i res_b = convolve(s + 4, coeffs_v); - - // Combine V round and 2F-H-V round into a single rounding - res_a = - _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); - res_b = - _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); - - const __m256i res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a, round_const_v), round_shift_v); - const __m256i res_b_round = _mm256_sra_epi32( - _mm256_add_epi32(res_b, round_const_v), round_shift_v); - - /* rounding code */ - // 16 bit conversion - const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); - // 8 bit conversion and saturation to uint8 - const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); - - const __m128i res_0 = _mm256_castsi256_si128(res_8b); - const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); - - // Store values into the destination buffer - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; - if (w - j > 4) { - _mm_storel_epi64(p_0, res_0); - _mm_storel_epi64(p_1, res_1); - } else if (w == 4) { - xx_storel_32(p_0, res_0); - xx_storel_32(p_1, res_1); - } else { - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); - } - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } - } -} - -static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { - __m256i s[4]; - s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); - s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); - s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32)); - s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32)); - _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); - _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); - _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]); - _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]); -} - -void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - (void)conv_params; - - if (w >= 16) { - assert(!((intptr_t)dst % 16)); - assert(!(dst_stride % 16)); - } - - if (w == 2) { - do { - memcpy(dst, src, 2 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 4) { - do { - memcpy(dst, src, 4 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 8) { - do { - __m128i s[2]; - s[0] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - s[1] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - _mm_storel_epi64((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_storel_epi64((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 16) { - do { - __m128i s[2]; - s[0] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - s[1] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - _mm_store_si128((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_store_si128((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 32) { - do { - __m256i s[2]; - s[0] = _mm256_loadu_si256((__m256i *)src); - src += src_stride; - s[1] = _mm256_loadu_si256((__m256i *)src); - src += src_stride; - _mm256_storeu_si256((__m256i *)dst, s[0]); - dst += dst_stride; - _mm256_storeu_si256((__m256i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 64) { - do { - __m256i s[4]; - s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); - s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); - src += src_stride; - s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); - s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); - src += src_stride; - _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); - _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); - dst += dst_stride; - _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]); - _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]); - dst += dst_stride; - h -= 2; - } while (h); - } else { - do { - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } -} diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c deleted file mode 100644 index b1a62a4f6..000000000 --- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c +++ /dev/null @@ -1,472 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> - -#include "config/av1_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve_sse2.h" -#include "av1/common/convolve.h" - -void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int bd = 8; - - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m128i zero = _mm_setzero_si128(); - const int bits = - FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - - assert(conv_params->round_0 > 0); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - // Filter even-index pixels - const __m128i src_0 = _mm_unpacklo_epi8(data, zero); - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i sum_round = - _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); - const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - - ((1 << (offset_bits - conv_params->round_1)) >> 1)); - const __m128i round_shift = _mm_cvtsi32_si128(bits); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); - __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); - - res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), - round_shift); - res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), - round_shift); - - const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); - const __m128i res = _mm_packus_epi16(res16, res16); - - // Accumulate values into the destination buffer - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - - if (w == 2) { - *(uint16_t *)p = _mm_cvtsi128_si32(res); - } else if (w == 4) { - *(uint32_t *)p = _mm_cvtsi128_si32(res); - } else { - _mm_storel_epi64(p, res); - } - } - } - } -} - -static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { - __m128i s[8]; - s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); - s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); - s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); - s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); - s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16)); - s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16)); - s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16)); - s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16)); - _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); - _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); - _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); - _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); - _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]); - _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]); - _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]); - _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]); -} - -void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - (void)conv_params; - - if (w >= 16) { - assert(!((intptr_t)dst % 16)); - assert(!(dst_stride % 16)); - } - - if (w == 2) { - do { - memcpy(dst, src, 2 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 4) { - do { - memcpy(dst, src, 4 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 8) { - do { - __m128i s[2]; - s[0] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - s[1] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - _mm_storel_epi64((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_storel_epi64((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 16) { - do { - __m128i s[2]; - s[0] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - s[1] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - _mm_store_si128((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_store_si128((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 32) { - do { - __m128i s[4]; - s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); - s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); - src += src_stride; - s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); - s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); - src += src_stride; - _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); - _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); - dst += dst_stride; - _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]); - _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 64) { - do { - __m128i s[8]; - s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); - s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); - s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); - s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); - src += src_stride; - s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); - s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); - s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); - s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); - src += src_stride; - _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); - _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); - _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); - _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); - dst += dst_stride; - _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]); - _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]); - _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]); - _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]); - dst += dst_stride; - h -= 2; - } while (h); - } else { - do { - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } -} - -void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int bd = 8; - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - - const int bits = - FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const __m128i zero = _mm_setzero_si128(); - const __m128i left_shift = _mm_cvtsi32_si128(bits); - int i, j; - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); - - assert((w % 4) == 0); - - if (!(w % 16)) { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); - - const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero); - const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero); - - const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift); - const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const); - - const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift); - const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const); - - if (do_average) { - const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j])); - const __m128i data_ref_0_hi = - _mm_loadu_si128((__m128i *)(&dst[j + 8])); - - const __m128i comp_avg_res_lo = - comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg); - - const __m128i round_result_lo = convolve_rounding( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - - const __m128i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg); - - const __m128i round_result_hi = convolve_rounding( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = - _mm_packus_epi16(round_result_lo, round_result_hi); - - _mm_store_si128((__m128i *)(&dst0[j]), res_8); - } else { - _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo); - _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi); - } - } - src += src_stride; - dst += dst_stride; - dst0 += dst_stride0; - } - } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); - const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); - - const __m128i res = _mm_sll_epi16(d16_0, left_shift); - const __m128i res_unsigned = _mm_add_epi16(res, offset_const); - - if (do_average) { - const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - - if (w > 4) - _mm_storel_epi64((__m128i *)(&dst0[j]), res_8); - else - *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8); - } else { - _mm_store_si128((__m128i *)(&dst[j]), res_unsigned); - } - } - src += src_stride; - dst += dst_stride; - dst0 += dst_stride0; - } - } -} diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c deleted file mode 100644 index 0e91ea947..000000000 --- a/third_party/aom/av1/common/x86/convolve_avx2.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/av1_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/x86/convolve_avx2.h" -#include "aom_dsp/x86/synonyms.h" - -void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; - - // right shift is F-1 because we are already dividing - // filter co-efficients by 2 - const int right_shift_bits = (FILTER_BITS - 1); - const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); - const __m256i right_shift_const = - _mm256_set1_epi16((1 << right_shift_bits) >> 1); - __m256i coeffs[4], s[8]; - - assert(conv_params->round_0 <= FILTER_BITS); - assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || - ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - - prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); - - (void)filter_params_x; - (void)subpel_x_q4; - (void)conv_params; - - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - - // Load lines a and b. Line a to lower 128, line b to upper 128 - const __m256i src_01a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - - const __m256i src_12a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - - const __m256i src_23a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - - const __m256i src_34a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - - const __m256i src_45a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - const __m256i src_56a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); - s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); - - s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); - s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); - s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - const __m256i src_67a = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); - - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); - - const __m256i res_lo = convolve_lowbd(s, coeffs); - - /* rounding code */ - // shift by F - 1 - const __m256i res_16b_lo = _mm256_sra_epi16( - _mm256_add_epi16(res_lo, right_shift_const), right_shift); - // 8 bit conversion and saturation to uint8 - __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); - - if (w - j > 8) { - const __m256i res_hi = convolve_lowbd(s + 4, coeffs); - - /* rounding code */ - // shift by F - 1 - const __m256i res_16b_hi = _mm256_sra_epi16( - _mm256_add_epi16(res_hi, right_shift_const), right_shift); - // 8 bit conversion and saturation to uint8 - __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); - - __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); - - const __m128i res_0 = _mm256_castsi256_si128(res_a); - const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_1); - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); - const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); - if (w - j > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_1); - } else if (w - j > 2) { - xx_storel_32(&dst[i * dst_stride + j], res_0); - xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); - } else { - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); - } - } - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } -} - -void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; - const int bits = FILTER_BITS - conv_params->round_0; - - __m256i filt[4], coeffs[4]; - - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - - const __m256i round_0_const = - _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); - const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); - const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(bits); - - (void)filter_params_y; - (void)subpel_y_q4; - - assert(bits >= 0); - assert((FILTER_BITS - conv_params->round_1) >= 0 || - ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); - assert(conv_params->round_0 > 0); - - if (w <= 8) { - for (i = 0; i < h; i += 2) { - const __m256i data = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&src_ptr[i * src_stride + src_stride]))), - 0x20); - - __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); - - res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), - round_0_shift); - - res_16b = - _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); - - /* rounding code */ - // 8 bit conversion and saturation to uint8 - __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - - const __m128i res_0 = _mm256_castsi256_si128(res_8b); - const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); - if (w > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); - } else if (w > 2) { - xx_storel_32(&dst[i * dst_stride], res_0); - xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); - } else { - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); - } - } - } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18 - // 19 20 21 22 23 - const __m256i data = _mm256_inserti128_si256( - _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), - 1); - - __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); - - res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), - round_0_shift); - - res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), - round_shift); - - /* rounding code */ - // 8 bit conversion and saturation to uint8 - __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - - // Store values into the destination buffer - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - res_8b = _mm256_permute4x64_epi64(res_8b, 216); - __m128i res = _mm256_castsi256_si128(res_8b); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); - } - } - } -} diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c deleted file mode 100644 index 5016642de..000000000 --- a/third_party/aom/av1/common/x86/convolve_sse2.c +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> - -#include "config/av1_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve_common_intrin.h" -#include "av1/common/convolve.h" - -static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, - const int subpel_q4, - __m128i *const coeffs /* [4] */) { - const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( - filter_params, subpel_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1 - coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 -} - -static INLINE __m128i convolve(const __m128i *const s, - const __m128i *const coeffs) { - const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); - const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); - const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); - const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); - const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); - return d; -} - -static INLINE __m128i convolve_lo_x(const __m128i *const s, - const __m128i *const coeffs) { - __m128i ss[4]; - ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); - ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); - ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); - ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); - return convolve(ss, coeffs); -} - -static INLINE __m128i convolve_lo_y(const __m128i *const s, - const __m128i *const coeffs) { - __m128i ss[4]; - ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); - ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); - ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); - ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); - return convolve(ss, coeffs); -} - -static INLINE __m128i convolve_hi_y(const __m128i *const s, - const __m128i *const coeffs) { - __m128i ss[4]; - ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); - ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); - ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); - ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); - return convolve(ss, coeffs); -} - -void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *src_ptr = src - fo_vert * src_stride; - const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); - __m128i coeffs[4]; - - (void)filter_params_x; - (void)subpel_x_q4; - (void)conv_params; - - assert(conv_params->round_0 <= FILTER_BITS); - assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || - ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); - - if (w <= 4) { - __m128i s[8], src6, res, res_round, res16; - uint32_t res_int; - src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); - s[4] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); - - do { - s[6] = _mm_unpacklo_epi8( - src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); - src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); - - res = convolve_lo_y(s + 0, coeffs); - res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); - res16 = _mm_packs_epi32(res_round, res_round); - res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); - - if (w == 2) - *(uint16_t *)dst = res_int; - else - *(uint32_t *)dst = res_int; - - src_ptr += src_stride; - dst += dst_stride; - - res = convolve_lo_y(s + 1, coeffs); - res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); - res16 = _mm_packs_epi32(res_round, res_round); - res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); - - if (w == 2) - *(uint16_t *)dst = res_int; - else - *(uint32_t *)dst = res_int; - - src_ptr += src_stride; - dst += dst_stride; - - s[0] = s[2]; - s[1] = s[3]; - s[2] = s[4]; - s[3] = s[5]; - s[4] = s[6]; - s[5] = s[7]; - h -= 2; - } while (h); - } else { - assert(!(w % 8)); - int j = 0; - do { - __m128i s[8], src6, res_lo, res_hi; - __m128i res_lo_round, res_hi_round, res16, res; - const uint8_t *data = &src_ptr[j]; - - src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); - s[4] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); - - int i = 0; - do { - data = &src_ptr[i * src_stride + j]; - s[6] = _mm_unpacklo_epi8( - src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); - src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); - - res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels - res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels - - res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - res16 = _mm_packs_epi32(res_lo_round, res_hi_round); - res = _mm_packus_epi16(res16, res16); - - _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); - i++; - - res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels - res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels - - res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - res16 = _mm_packs_epi32(res_lo_round, res_hi_round); - res = _mm_packus_epi16(res16, res16); - - _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); - i++; - - s[0] = s[2]; - s[1] = s[3]; - s[2] = s[4]; - s[3] = s[5]; - s[4] = s[6]; - s[5] = s[7]; - } while (i < h); - j += 8; - } while (j < w); - } -} - -void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *src_ptr = src - fo_horiz; - const int bits = FILTER_BITS - conv_params->round_0; - const __m128i round_0_const = - _mm_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); - const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); - const __m128i round_shift = _mm_cvtsi32_si128(bits); - __m128i coeffs[4]; - - (void)filter_params_y; - (void)subpel_y_q4; - - assert(bits >= 0); - assert((FILTER_BITS - conv_params->round_1) >= 0 || - ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); - - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); - - if (w <= 4) { - do { - const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); - __m128i s[4]; - - s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); - s[1] = - _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); - s[2] = - _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); - s[3] = - _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); - const __m128i res_lo = convolve_lo_x(s, coeffs); - __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); - res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); - - const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round); - const __m128i res = _mm_packus_epi16(res16, res16); - - uint32_t r = _mm_cvtsi128_si32(res); - if (w == 2) - *(uint16_t *)dst = r; - else - *(uint32_t *)dst = r; - - src_ptr += src_stride; - dst += dst_stride; - } while (--h); - } else { - assert(!(w % 8)); - int i = 0; - do { - int j = 0; - do { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - __m128i s[4]; - - // Filter even-index pixels - s[0] = data; - s[1] = _mm_srli_si128(data, 2); - s[2] = _mm_srli_si128(data, 4); - s[3] = _mm_srli_si128(data, 6); - const __m128i res_even = convolve_lo_x(s, coeffs); - - // Filter odd-index pixels - s[0] = _mm_srli_si128(data, 1); - s[1] = _mm_srli_si128(data, 3); - s[2] = _mm_srli_si128(data, 5); - s[3] = _mm_srli_si128(data, 7); - const __m128i res_odd = convolve_lo_x(s, coeffs); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); - res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), - round_shift); - __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift); - res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), - round_shift); - - const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); - const __m128i res = _mm_packus_epi16(res16, res16); - - _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); - j += 8; - } while (j < w); - } while (++i < h); - } -} diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c deleted file mode 100644 index c11edc1d4..000000000 --- a/third_party/aom/av1/common/x86/filterintra_sse4.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <smmintrin.h> - -#include "config/av1_rtcd.h" - -#include "aom_dsp/x86/synonyms.h" -#include "av1/common/enums.h" -#include "av1/common/reconintra.h" - -void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, - TX_SIZE tx_size, const uint8_t *above, - const uint8_t *left, int mode) { - int r, c; - uint8_t buffer[33][33]; - const int bw = tx_size_wide[tx_size]; - const int bh = tx_size_high[tx_size]; - - assert(bw <= 32 && bh <= 32); - - // The initialization is just for silencing Jenkins static analysis warnings - for (r = 0; r < bh + 1; ++r) - memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0])); - - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; - memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); - - const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]); - const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]); - const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]); - const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]); - const __m128i filter_intra_scale_bits = - _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS)); - - for (r = 1; r < bh + 1; r += 2) { - for (c = 1; c < bw + 1; c += 4) { - DECLARE_ALIGNED(16, uint8_t, p[8]); - memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); - p[5] = buffer[r][c - 1]; - p[6] = buffer[r + 1][c - 1]; - p[7] = 0; - const __m128i p_b = xx_loadl_64(p); - const __m128i in = _mm_unpacklo_epi64(p_b, p_b); - const __m128i out_01 = _mm_maddubs_epi16(in, f1f0); - const __m128i out_23 = _mm_maddubs_epi16(in, f3f2); - const __m128i out_45 = _mm_maddubs_epi16(in, f5f4); - const __m128i out_67 = _mm_maddubs_epi16(in, f7f6); - const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23); - const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67); - const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567); - // Rounding - const __m128i round_w = - _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits); - const __m128i out_r = _mm_packus_epi16(round_w, round_w); - const __m128i out_r1 = _mm_srli_si128(out_r, 4); - // Storing - xx_storel_32(&buffer[r][c], out_r); - xx_storel_32(&buffer[r + 1][c], out_r1); - } - } - - for (r = 0; r < bh; ++r) { - memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); - dst += stride; - } -} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c deleted file mode 100644 index ae68f0bbb..000000000 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve_avx2.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "av1/common/convolve.h" - -void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, - uint16_t *dst, int dst_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - // Check that, even with 12-bit input, the intermediate values will fit - // into an unsigned 16-bit intermediate array. - assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); - - __m256i s[8], coeffs_y[4], coeffs_x[4]; - - const __m256i round_const_x = _mm256_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); - - const __m256i round_const_y = _mm256_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); - - const int bits = - FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); - const __m256i clip_pixel = - _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - const __m256i zero = _mm256_setzero_si256(); - - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { - for (i = 0; i < im_h; i += 2) { - const __m256i row0 = - _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); - __m256i row1 = _mm256_set1_epi16(0); - if (i + 1 < im_h) - row1 = - _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); - - const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); - const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); - - // even pixels - s[0] = _mm256_alignr_epi8(r1, r0, 0); - s[1] = _mm256_alignr_epi8(r1, r0, 4); - s[2] = _mm256_alignr_epi8(r1, r0, 8); - s[3] = _mm256_alignr_epi8(r1, r0, 12); - - __m256i res_even = convolve(s, coeffs_x); - res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), - round_shift_x); - - // odd pixels - s[0] = _mm256_alignr_epi8(r1, r0, 2); - s[1] = _mm256_alignr_epi8(r1, r0, 6); - s[2] = _mm256_alignr_epi8(r1, r0, 10); - s[3] = _mm256_alignr_epi8(r1, r0, 14); - - __m256i res_odd = convolve(s, coeffs_x); - res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), - round_shift_x); - - __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); - __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); - __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); - - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); - } - } - - /* Vertical filter */ - { - __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); - __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); - __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); - __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - - s[0] = _mm256_unpacklo_epi16(s0, s1); - s[1] = _mm256_unpacklo_epi16(s2, s3); - s[2] = _mm256_unpacklo_epi16(s4, s5); - - s[4] = _mm256_unpackhi_epi16(s0, s1); - s[5] = _mm256_unpackhi_epi16(s2, s3); - s[6] = _mm256_unpackhi_epi16(s4, s5); - - for (i = 0; i < h; i += 2) { - const int16_t *data = &im_block[i * im_stride]; - - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); - - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); - - const __m256i res_a = convolve(s, coeffs_y); - __m256i res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a, round_const_y), round_shift_y); - - res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits); - - if (w - j > 4) { - const __m256i res_b = convolve(s + 4, coeffs_y); - __m256i res_b_round = _mm256_sra_epi32( - _mm256_add_epi32(res_b, round_const_y), round_shift_y); - res_b_round = - _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits), - round_shift_bits); - - __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); - res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); - res_16bit = _mm256_max_epi16(res_16bit, zero); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res_16bit)); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res_16bit, 1)); - } else if (w == 4) { - res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); - res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); - res_a_round = _mm256_max_epi16(res_a_round, zero); - - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res_a_round)); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res_a_round, 1)); - } else { - res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); - res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); - res_a_round = _mm256_max_epi16(res_a_round, zero); - - xx_storel_32((__m128i *)&dst[i * dst_stride + j], - _mm256_castsi256_si128(res_a_round)); - xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], - _mm256_extracti128_si256(res_a_round, 1)); - } - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } - } -} - -static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { - __m256i s[4]; - s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); - s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); - s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); - s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); - _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); - _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); - _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); - _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); -} - -static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { - __m256i s[8]; - s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); - s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); - s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); - s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); - s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16)); - s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16)); - s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16)); - s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16)); - - _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); - _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); - _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); - _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); - _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]); - _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]); - _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]); - _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]); -} - -void av1_highbd_convolve_2d_copy_sr_avx2( - const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - (void)conv_params; - (void)bd; - - if (w >= 16) { - assert(!((intptr_t)dst % 16)); - assert(!(dst_stride % 16)); - } - - if (w == 2) { - do { - memcpy(dst, src, 2 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 4) { - do { - __m128i s[2]; - s[0] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - s[1] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - _mm_storel_epi64((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_storel_epi64((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 8) { - do { - __m128i s[2]; - s[0] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - s[1] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - _mm_store_si128((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_store_si128((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 16) { - do { - __m256i s[2]; - s[0] = _mm256_loadu_si256((__m256i *)src); - src += src_stride; - s[1] = _mm256_loadu_si256((__m256i *)src); - src += src_stride; - _mm256_storeu_si256((__m256i *)dst, s[0]); - dst += dst_stride; - _mm256_storeu_si256((__m256i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 32) { - do { - __m256i s[4]; - s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); - s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); - src += src_stride; - s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); - s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); - src += src_stride; - _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); - _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); - dst += dst_stride; - _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]); - _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 64) { - do { - copy_64(src, dst); - src += src_stride; - dst += dst_stride; - copy_64(src, dst); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else { - do { - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } -} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c deleted file mode 100644 index 15f8872c1..000000000 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <emmintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_filter.h" - -static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { - __m128i s[8]; - s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); - s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); - s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); - s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); - s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); - s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); - s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); - s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); - _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); - _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); - _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); - _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); - _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); - _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); - _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); - _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); -} - -static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { - __m128i s[16]; - s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); - s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); - s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); - s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); - s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); - s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); - s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); - s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); - s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8)); - s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8)); - s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8)); - s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8)); - s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8)); - s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8)); - s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8)); - s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8)); - _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); - _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); - _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); - _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); - _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); - _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); - _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); - _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); - _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]); - _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]); - _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]); - _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]); - _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]); - _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]); - _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]); - _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]); -} - -void av1_highbd_convolve_2d_copy_sr_sse2( - const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - (void)conv_params; - (void)bd; - if (w >= 16) { - assert(!((intptr_t)dst % 16)); - assert(!(dst_stride % 16)); - } - - if (w == 2) { - do { - __m128i s = _mm_loadl_epi64((__m128i *)src); - *(uint32_t *)dst = _mm_cvtsi128_si32(s); - src += src_stride; - dst += dst_stride; - s = _mm_loadl_epi64((__m128i *)src); - *(uint32_t *)dst = _mm_cvtsi128_si32(s); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 4) { - do { - __m128i s[2]; - s[0] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - s[1] = _mm_loadl_epi64((__m128i *)src); - src += src_stride; - _mm_storel_epi64((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_storel_epi64((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 8) { - do { - __m128i s[2]; - s[0] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - s[1] = _mm_loadu_si128((__m128i *)src); - src += src_stride; - _mm_store_si128((__m128i *)dst, s[0]); - dst += dst_stride; - _mm_store_si128((__m128i *)dst, s[1]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 16) { - do { - __m128i s[4]; - s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); - s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); - src += src_stride; - s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); - s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); - src += src_stride; - _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); - _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); - dst += dst_stride; - _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]); - _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 32) { - do { - __m128i s[8]; - s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); - s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); - s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); - s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); - src += src_stride; - s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); - s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); - s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); - s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); - src += src_stride; - _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); - _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); - _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); - _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); - dst += dst_stride; - _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]); - _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]); - _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]); - _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]); - dst += dst_stride; - h -= 2; - } while (h); - } else if (w == 64) { - do { - copy_64(src, dst); - src += src_stride; - dst += dst_stride; - copy_64(src, dst); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } else { - do { - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - copy_128(src, dst); - src += src_stride; - dst += dst_stride; - h -= 2; - } while (h); - } -} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c deleted file mode 100644 index 3f8dafb4b..000000000 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> -#include <smmintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve_sse2.h" -#include "aom_dsp/x86/convolve_sse4_1.h" -#include "av1/common/convolve.h" - -void av1_highbd_jnt_convolve_2d_copy_sse4_1( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - - const int bits = - FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; - const __m128i left_shift = _mm_cvtsi32_si128(bits); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi32(w0); - const __m128i wt1 = _mm_set1_epi32(w1); - const __m128i zero = _mm_setzero_si128(); - int i, j; - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi32(offset); - const __m128i offset_const_16b = _mm_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); - const __m128i clip_pixel_to_bd = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - - assert(bits <= 4); - - if (!(w % 8)) { - for (i = 0; i < h; i += 1) { - for (j = 0; j < w; j += 8) { - const __m128i src_16bit = - _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); - const __m128i res = _mm_sll_epi16(src_16bit, left_shift); - if (do_average) { - const __m128i data_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); - const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); - - const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero); - const __m128i res_unsigned_lo = - _mm_add_epi32(res_32b_lo, offset_const); - - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); - const __m128i res_unsigned_hi = - _mm_add_epi32(res_32b_hi, offset_const); - - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m128i round_result_lo = highbd_convolve_rounding_sse2( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m128i round_result_hi = highbd_convolve_rounding_sse2( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_16b = - _mm_packus_epi32(round_result_lo, round_result_hi); - const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); - } else { - const __m128i res_unsigned_16b = - _mm_adds_epu16(res, offset_const_16b); - - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), - res_unsigned_16b); - } - } - } - } else if (!(w % 4)) { - for (i = 0; i < h; i += 2) { - for (j = 0; j < w; j += 4) { - const __m128i src_row_0 = - _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); - const __m128i src_row_1 = - _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); - const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1); - - const __m128i res = _mm_sll_epi16(src_10, left_shift); - - if (do_average) { - const __m128i data_0 = - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); - const __m128i data_1 = _mm_loadl_epi64( - (__m128i *)(&dst[i * dst_stride + j + dst_stride])); - - const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); - const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); - - const __m128i res_32b = _mm_unpacklo_epi16(res, zero); - const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const); - - const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); - const __m128i res_unsigned_hi = - _mm_add_epi32(res_32b_hi, offset_const); - - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m128i round_result_lo = highbd_convolve_rounding_sse2( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m128i round_result_hi = highbd_convolve_rounding_sse2( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_16b = - _mm_packus_epi32(round_result_lo, round_result_hi); - const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_1 = _mm_srli_si128(res_clip, 8); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); - _mm_storel_epi64( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - const __m128i res_unsigned_16b = - _mm_adds_epu16(res, offset_const_16b); - - const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8); - - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), - res_unsigned_16b); - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - } - } -} - -void av1_highbd_jnt_convolve_2d_sse4_1( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi32(w0); - const __m128i wt1 = _mm_set1_epi32(w1); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi32(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); - const __m128i clip_pixel_to_bd = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - - // Check that, even with 12-bit input, the intermediate values will fit - // into an unsigned 16-bit intermediate array. - assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i data2 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(data, coeff_01); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); - const __m128i res_5 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - - const __m128i res_unsigned_lo = - _mm_add_epi32(res_lo_round, offset_const); - - if (w < 8) { - if (do_average) { - const __m128i data_0 = - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); - - const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m128i round_result = highbd_convolve_rounding_sse2( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_16b = - _mm_packus_epi32(round_result, round_result); - const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); - } else { - const __m128i res_16b = - _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b); - } - } else { - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - const __m128i res_unsigned_hi = - _mm_add_epi32(res_hi_round, offset_const); - - if (do_average) { - const __m128i data_lo = - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); - const __m128i data_hi = - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4])); - - const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); - const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); - - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m128i round_result_lo = - highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, - &rounding_const, rounding_shift); - const __m128i round_result_hi = - highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const, - &rounding_const, rounding_shift); - - const __m128i res_16b = - _mm_packus_epi32(round_result_lo, round_result_hi); - const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); - } else { - const __m128i res_16b = - _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); - } - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c deleted file mode 100644 index 1d029db39..000000000 --- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve_sse2.h" -#include "av1/common/convolve.h" - -void av1_highbd_convolve_2d_sr_ssse3( - const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - // Check that, even with 12-bit input, the intermediate values will fit - // into an unsigned 16-bit intermediate array. - assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); - __m128i coeffs_x[4], coeffs_y[4], s[16]; - - const __m128i round_const_x = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); - - const __m128i round_const_y = - _mm_set1_epi32(((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); - - const int bits = - FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); - const __m128i clip_pixel = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - const __m128i zero = _mm_setzero_si128(); - - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { - for (i = 0; i < im_h; i += 1) { - const __m128i row00 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i row01 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); - - // even pixels - s[0] = _mm_alignr_epi8(row01, row00, 0); - s[1] = _mm_alignr_epi8(row01, row00, 4); - s[2] = _mm_alignr_epi8(row01, row00, 8); - s[3] = _mm_alignr_epi8(row01, row00, 12); - - __m128i res_even = convolve(s, coeffs_x); - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), - round_shift_x); - - // odd pixels - s[0] = _mm_alignr_epi8(row01, row00, 2); - s[1] = _mm_alignr_epi8(row01, row00, 6); - s[2] = _mm_alignr_epi8(row01, row00, 10); - s[3] = _mm_alignr_epi8(row01, row00, 14); - - __m128i res_odd = convolve(s, coeffs_x); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); - - __m128i res_even1 = _mm_packs_epi32(res_even, res_even); - __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); - __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); - - _mm_store_si128((__m128i *)&im_block[i * im_stride], res); - } - } - /* Vertical filter */ - { - __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); - __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); - __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); - __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); - __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); - __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); - __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); - - s[0] = _mm_unpacklo_epi16(s0, s1); - s[1] = _mm_unpacklo_epi16(s2, s3); - s[2] = _mm_unpacklo_epi16(s4, s5); - - s[4] = _mm_unpackhi_epi16(s0, s1); - s[5] = _mm_unpackhi_epi16(s2, s3); - s[6] = _mm_unpackhi_epi16(s4, s5); - - s[0 + 8] = _mm_unpacklo_epi16(s1, s2); - s[1 + 8] = _mm_unpacklo_epi16(s3, s4); - s[2 + 8] = _mm_unpacklo_epi16(s5, s6); - - s[4 + 8] = _mm_unpackhi_epi16(s1, s2); - s[5 + 8] = _mm_unpackhi_epi16(s3, s4); - s[6 + 8] = _mm_unpackhi_epi16(s5, s6); - - for (i = 0; i < h; i += 2) { - const int16_t *data = &im_block[i * im_stride]; - - __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); - __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride)); - - s[3] = _mm_unpacklo_epi16(s6, s7); - s[7] = _mm_unpackhi_epi16(s6, s7); - - s[3 + 8] = _mm_unpacklo_epi16(s7, s8); - s[7 + 8] = _mm_unpackhi_epi16(s7, s8); - - const __m128i res_a0 = convolve(s, coeffs_y); - __m128i res_a_round0 = - _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y); - res_a_round0 = _mm_sra_epi32( - _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); - - const __m128i res_a1 = convolve(s + 8, coeffs_y); - __m128i res_a_round1 = - _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y); - res_a_round1 = _mm_sra_epi32( - _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); - - if (w - j > 4) { - const __m128i res_b0 = convolve(s + 4, coeffs_y); - __m128i res_b_round0 = _mm_sra_epi32( - _mm_add_epi32(res_b0, round_const_y), round_shift_y); - res_b_round0 = _mm_sra_epi32( - _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits); - - const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); - __m128i res_b_round1 = _mm_sra_epi32( - _mm_add_epi32(res_b1, round_const_y), round_shift_y); - res_b_round1 = _mm_sra_epi32( - _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits); - - __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); - res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); - res_16bit0 = _mm_max_epi16(res_16bit0, zero); - - __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); - res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); - res_16bit1 = _mm_max_epi16(res_16bit1, zero); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_16bit1); - } else if (w == 4) { - res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); - res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); - res_a_round0 = _mm_max_epi16(res_a_round0, zero); - - res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); - res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); - res_a_round1 = _mm_max_epi16(res_a_round1, zero); - - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_a_round1); - } else { - res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); - res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); - res_a_round0 = _mm_max_epi16(res_a_round0, zero); - - res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); - res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); - res_a_round1 = _mm_max_epi16(res_a_round1, zero); - - *((uint32_t *)(&dst[i * dst_stride + j])) = - _mm_cvtsi128_si32(res_a_round0); - - *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = - _mm_cvtsi128_si32(res_a_round1); - } - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - - s[0 + 8] = s[1 + 8]; - s[1 + 8] = s[2 + 8]; - s[2 + 8] = s[3 + 8]; - - s[4 + 8] = s[5 + 8]; - s[5 + 8] = s[6 + 8]; - s[6 + 8] = s[7 + 8]; - - s6 = s8; - } - } - } -} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c deleted file mode 100644 index ade2af03e..000000000 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c +++ /dev/null @@ -1,1349 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <assert.h> -#include <immintrin.h> - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "av1/common/av1_inv_txfm1d_cfg.h" -#include "av1/common/idct.h" -#include "av1/common/x86/av1_inv_txfm_ssse3.h" -#include "av1/common/x86/highbd_txfm_utility_sse4.h" - -// Note: -// Total 32x4 registers to represent 32x32 block coefficients. -// For high bit depth, each coefficient is 4-byte. -// Each __m256i register holds 8 coefficients. -// So each "row" we needs 4 register. Totally 32 rows -// Register layout: -// v0, v1, v2, v3, -// v4, v5, v6, v7, -// ... ... -// v124, v125, v126, v127 - -static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i one = _mm256_set1_epi16(1); - const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); - __m256i clamped, mask; - - mask = _mm256_cmpgt_epi16(u, max); - clamped = _mm256_andnot_si256(mask, u); - mask = _mm256_and_si256(mask, max); - clamped = _mm256_or_si256(mask, clamped); - mask = _mm256_cmpgt_epi16(clamped, zero); - clamped = _mm256_and_si256(clamped, mask); - - return clamped; -} - -static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, - __m256i res0, __m256i res1, - const int bd) { - __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); - __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); - - x0 = _mm256_add_epi32(res0, x0); - x1 = _mm256_add_epi32(res1, x1); - x0 = _mm256_packus_epi32(x0, x1); - x0 = _mm256_permute4x64_epi64(x0, 0xd8); - x0 = highbd_clamp_epi16_avx2(x0, bd); - return x0; -} - -static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, - int stride, int flipud, - int height, const int bd) { - int j = flipud ? (height - 1) : 0; - const int step = flipud ? -1 : 1; - for (int i = 0; i < height; ++i, j += step) { - __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); - __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); - - _mm256_storeu_si256((__m256i *)(output + i * stride), u); - } -} - -static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { - __m256i tmp, round; - round = _mm256_set1_epi32(1 << (bit - 1)); - tmp = _mm256_add_epi32(vec, round); - return _mm256_srai_epi32(tmp, bit); -} - -static INLINE void av1_round_shift_array_32_avx2(__m256i *input, - __m256i *output, - const int size, - const int bit) { - if (bit > 0) { - int i; - for (i = 0; i < size; i++) { - output[i] = av1_round_shift_32_avx2(input[i], bit); - } - } else { - int i; - for (i = 0; i < size; i++) { - output[i] = _mm256_slli_epi32(input[i], -bit); - } - } -} - -static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i x0, x1; - - u0 = _mm256_unpacklo_epi32(in[0], in[1]); - u1 = _mm256_unpackhi_epi32(in[0], in[1]); - - u2 = _mm256_unpacklo_epi32(in[2], in[3]); - u3 = _mm256_unpackhi_epi32(in[2], in[3]); - - u4 = _mm256_unpacklo_epi32(in[4], in[5]); - u5 = _mm256_unpackhi_epi32(in[4], in[5]); - - u6 = _mm256_unpacklo_epi32(in[6], in[7]); - u7 = _mm256_unpackhi_epi32(in[6], in[7]); - - x0 = _mm256_unpacklo_epi64(u0, u2); - x1 = _mm256_unpacklo_epi64(u4, u6); - out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); - - x0 = _mm256_unpackhi_epi64(u0, u2); - x1 = _mm256_unpackhi_epi64(u4, u6); - out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); - - x0 = _mm256_unpacklo_epi64(u1, u3); - x1 = _mm256_unpacklo_epi64(u5, u7); - out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); - - x0 = _mm256_unpackhi_epi64(u1, u3); - x1 = _mm256_unpackhi_epi64(u5, u7); - out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); - out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); -} - -static void load_buffer_32x32(const int32_t *coeff, __m256i *in, - int input_stiride, int size) { - int i; - for (i = 0; i < size; ++i) { - in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stiride)); - } -} - -static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, - const __m256i *rounding, int bit) { - __m256i x; - x = _mm256_mullo_epi32(*w0, *n0); - x = _mm256_add_epi32(x, *rounding); - x = _mm256_srai_epi32(x, bit); - return x; -} - -static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, - const __m256i *w1, const __m256i *n1, - const __m256i *rounding, int bit) { - __m256i x, y; - - x = _mm256_mullo_epi32(*w0, *n0); - y = _mm256_mullo_epi32(*w1, *n1); - x = _mm256_add_epi32(x, y); - x = _mm256_add_epi32(x, *rounding); - x = _mm256_srai_epi32(x, bit); - return x; -} - -static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, - __m256i *out1, const __m256i *clamp_lo, - const __m256i *clamp_hi) { - __m256i a0 = _mm256_add_epi32(in0, in1); - __m256i a1 = _mm256_sub_epi32(in0, in1); - - a0 = _mm256_max_epi32(a0, *clamp_lo); - a0 = _mm256_min_epi32(a0, *clamp_hi); - a1 = _mm256_max_epi32(a1, *clamp_lo); - a1 = _mm256_min_epi32(a1, *clamp_hi); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1, - __m256i *out0, __m256i *out1) { - __m256i a0 = _mm256_add_epi32(in0, in1); - __m256i a1 = _mm256_sub_epi32(in0, in1); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_shift_avx2(const __m256i in0, const __m256i in1, - __m256i *out0, __m256i *out1, - const __m256i *clamp_lo, const __m256i *clamp_hi, - int shift) { - __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); - __m256i in0_w_offset = _mm256_add_epi32(in0, offset); - __m256i a0 = _mm256_add_epi32(in0_w_offset, in1); - __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1); - - a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - - a0 = _mm256_max_epi32(a0, *clamp_lo); - a0 = _mm256_min_epi32(a0, *clamp_hi); - a1 = _mm256_max_epi32(a1, *clamp_lo); - a1 = _mm256_min_epi32(a1, *clamp_hi); - - *out0 = a0; - *out1 = a1; -} - -static INLINE void idct32_stage4_avx2( - __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, - const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, - const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, - const __m256i *rounding, int bit) { - __m256i temp1, temp2; - temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); - bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); - bf1[17] = temp1; - - temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); - bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); - bf1[18] = temp2; - - temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); - bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); - bf1[21] = temp1; - - temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); - bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); - bf1[22] = temp2; -} - -static INLINE void idct32_stage5_avx2( - __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, - const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, - const __m256i *clamp_hi, const __m256i *rounding, int bit) { - __m256i temp1, temp2; - temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); - bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); - bf1[9] = temp1; - - temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); - bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); - bf1[10] = temp2; - - addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); - addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); - addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); - addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); - addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); - addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); - addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); - addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); -} - -static INLINE void idct32_stage6_avx2( - __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, - const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, - const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, - const __m256i *rounding, int bit) { - __m256i temp1, temp2; - temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); - bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); - bf1[5] = temp1; - - addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); - addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); - addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); - addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); - - temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); - bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); - bf1[18] = temp1; - temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); - bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); - bf1[19] = temp2; - temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); - bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); - bf1[20] = temp1; - temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); - bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); - bf1[21] = temp2; -} - -static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, - const __m256i *cospi32, - const __m256i *clamp_lo, - const __m256i *clamp_hi, - const __m256i *rounding, int bit) { - __m256i temp1, temp2; - addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); - addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); - addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); - addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); - - temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); - bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); - bf1[10] = temp1; - temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); - bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); - bf1[11] = temp2; - - addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); - addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); - addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); - addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); - addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); - addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); - addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); - addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); -} - -static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, - const __m256i *cospi32, - const __m256i *clamp_lo, - const __m256i *clamp_hi, - const __m256i *rounding, int bit) { - __m256i temp1, temp2; - addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); - addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); - addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); - addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); - addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); - addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); - addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); - addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); - - temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); - bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); - bf1[20] = temp1; - temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); - bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); - bf1[21] = temp2; - temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); - bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); - bf1[22] = temp1; - temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); - bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); - bf1[23] = temp2; -} - -static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, - const int do_cols, const int bd, - const int out_shift, - const int log_range) { - if (do_cols) { - addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31); - addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30); - addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29); - addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28); - addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27); - addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26); - addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25); - addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24); - addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23); - addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22); - addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21); - addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20); - addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19); - addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18); - addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17); - addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); - } -} - -static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); - const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); - const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); - __m256i x; - // stage 0 - // stage 1 - // stage 2 - // stage 3 - // stage 4 - // stage 5 - x = _mm256_mullo_epi32(in[0], cospi32); - x = _mm256_add_epi32(x, rounding); - x = _mm256_srai_epi32(x, bit); - - // stage 6 - // stage 7 - // stage 8 - // stage 9 - if (do_cols) { - x = _mm256_max_epi32(x, clamp_lo); - x = _mm256_min_epi32(x, clamp_hi); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); - x = _mm256_add_epi32(offset, x); - x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - x = _mm256_max_epi32(x, clamp_lo_out); - x = _mm256_min_epi32(x, clamp_hi_out); - } - - out[0] = x; - out[1] = x; - out[2] = x; - out[3] = x; - out[4] = x; - out[5] = x; - out[6] = x; - out[7] = x; - out[8] = x; - out[9] = x; - out[10] = x; - out[11] = x; - out[12] = x; - out[13] = x; - out[14] = x; - out[15] = x; - out[16] = x; - out[17] = x; - out[18] = x; - out[19] = x; - out[20] = x; - out[21] = x; - out[22] = x; - out[23] = x; - out[24] = x; - out[25] = x; - out[26] = x; - out[27] = x; - out[28] = x; - out[29] = x; - out[30] = x; - out[31] = x; -} - -static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); - const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); - const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); - const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); - const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); - const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); - const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); - const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); - const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); - const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); - const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); - const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); - const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); - const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); - const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); - const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); - const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); - const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); - const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); - const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); - const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); - const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); - const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); - const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); - const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); - const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); - const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); - const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); - __m256i bf1[32]; - - { - // stage 0 - // stage 1 - bf1[0] = in[0]; - bf1[4] = in[4]; - bf1[8] = in[2]; - bf1[12] = in[6]; - bf1[16] = in[1]; - bf1[20] = in[5]; - bf1[24] = in[3]; - bf1[28] = in[7]; - - // stage 2 - bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); - bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); - bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); - bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); - bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); - bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); - bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); - bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); - - // stage 3 - bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); - bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); - - bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); - bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); - bf1[17] = bf1[16]; - bf1[18] = bf1[19]; - bf1[21] = bf1[20]; - bf1[22] = bf1[23]; - bf1[25] = bf1[24]; - bf1[26] = bf1[27]; - bf1[29] = bf1[28]; - bf1[30] = bf1[31]; - - // stage 4 - bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); - bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); - - bf1[9] = bf1[8]; - bf1[10] = bf1[11]; - bf1[13] = bf1[12]; - bf1[14] = bf1[15]; - - idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, - &cospi24, &cospi40, &cospim24, &rounding, bit); - - // stage 5 - bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); - bf1[1] = bf1[0]; - bf1[5] = bf1[4]; - bf1[6] = bf1[7]; - - idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, - &clamp_hi, &rounding, bit); - - // stage 6 - bf1[3] = bf1[0]; - bf1[2] = bf1[1]; - - idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, - &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); - - // stage 7 - idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 8 - idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 9 - idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); - } -} - -static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); - const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); - const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); - const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); - const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); - const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); - const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); - const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); - const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); - const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); - const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); - const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); - const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); - const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); - const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); - const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); - const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); - const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); - const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); - const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); - const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); - const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); - const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); - const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); - const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); - const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); - const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); - const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); - const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); - const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); - const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); - const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); - const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); - const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); - const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); - const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); - const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); - const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); - const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); - const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); - __m256i bf1[32]; - - { - // stage 0 - // stage 1 - bf1[0] = in[0]; - bf1[2] = in[8]; - bf1[4] = in[4]; - bf1[6] = in[12]; - bf1[8] = in[2]; - bf1[10] = in[10]; - bf1[12] = in[6]; - bf1[14] = in[14]; - bf1[16] = in[1]; - bf1[18] = in[9]; - bf1[20] = in[5]; - bf1[22] = in[13]; - bf1[24] = in[3]; - bf1[26] = in[11]; - bf1[28] = in[7]; - bf1[30] = in[15]; - - // stage 2 - bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); - bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); - bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); - bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); - bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); - bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); - bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); - bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); - bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); - bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); - bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); - bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); - bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); - bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); - bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); - bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); - - // stage 3 - bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); - bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); - bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); - bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); - bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); - bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); - bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); - bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); - - addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); - - // stage 4 - bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); - bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); - bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); - bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); - - addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); - - idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, - &cospi24, &cospi40, &cospim24, &rounding, bit); - - // stage 5 - bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); - bf1[1] = bf1[0]; - bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); - bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); - - addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); - - idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, - &clamp_hi, &rounding, bit); - - // stage 6 - addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); - - idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, - &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); - - // stage 7 - idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 8 - idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 9 - idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range); - } -} - -static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, - int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); - const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); - const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); - const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); - const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); - const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); - const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); - const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); - const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); - const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); - const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); - const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); - const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); - const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); - const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); - const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); - const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); - const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); - const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); - const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); - const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); - const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); - const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); - const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); - const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); - const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); - const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); - const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); - const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); - const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); - const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); - const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); - const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); - const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); - const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); - const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); - const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); - const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); - const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); - const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); - const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); - const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); - const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); - const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); - const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); - const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); - const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); - const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); - const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); - const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); - const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); - const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); - __m256i bf1[32], bf0[32]; - - { - // stage 0 - // stage 1 - bf1[0] = in[0]; - bf1[1] = in[16]; - bf1[2] = in[8]; - bf1[3] = in[24]; - bf1[4] = in[4]; - bf1[5] = in[20]; - bf1[6] = in[12]; - bf1[7] = in[28]; - bf1[8] = in[2]; - bf1[9] = in[18]; - bf1[10] = in[10]; - bf1[11] = in[26]; - bf1[12] = in[6]; - bf1[13] = in[22]; - bf1[14] = in[14]; - bf1[15] = in[30]; - bf1[16] = in[1]; - bf1[17] = in[17]; - bf1[18] = in[9]; - bf1[19] = in[25]; - bf1[20] = in[5]; - bf1[21] = in[21]; - bf1[22] = in[13]; - bf1[23] = in[29]; - bf1[24] = in[3]; - bf1[25] = in[19]; - bf1[26] = in[11]; - bf1[27] = in[27]; - bf1[28] = in[7]; - bf1[29] = in[23]; - bf1[30] = in[15]; - bf1[31] = in[31]; - - // stage 2 - bf0[0] = bf1[0]; - bf0[1] = bf1[1]; - bf0[2] = bf1[2]; - bf0[3] = bf1[3]; - bf0[4] = bf1[4]; - bf0[5] = bf1[5]; - bf0[6] = bf1[6]; - bf0[7] = bf1[7]; - bf0[8] = bf1[8]; - bf0[9] = bf1[9]; - bf0[10] = bf1[10]; - bf0[11] = bf1[11]; - bf0[12] = bf1[12]; - bf0[13] = bf1[13]; - bf0[14] = bf1[14]; - bf0[15] = bf1[15]; - bf0[16] = - half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); - bf0[17] = - half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); - bf0[18] = - half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); - bf0[19] = - half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); - bf0[20] = - half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); - bf0[21] = - half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); - bf0[22] = - half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); - bf0[23] = - half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); - bf0[24] = - half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); - bf0[25] = - half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); - bf0[26] = - half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); - bf0[27] = - half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); - bf0[28] = - half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); - bf0[29] = - half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); - bf0[30] = - half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); - bf0[31] = - half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); - - // stage 3 - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = - half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); - bf1[9] = - half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); - bf1[10] = - half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); - bf1[11] = - half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); - bf1[12] = - half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); - bf1[13] = - half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); - bf1[14] = - half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); - bf1[15] = - half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); - - addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); - - // stage 4 - bf0[0] = bf1[0]; - bf0[1] = bf1[1]; - bf0[2] = bf1[2]; - bf0[3] = bf1[3]; - bf0[4] = - half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); - bf0[5] = - half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); - bf0[6] = - half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); - bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); - - addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); - - bf0[16] = bf1[16]; - bf0[17] = - half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); - bf0[18] = - half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); - bf0[19] = bf1[19]; - bf0[20] = bf1[20]; - bf0[21] = - half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); - bf0[22] = - half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); - bf0[23] = bf1[23]; - bf0[24] = bf1[24]; - bf0[25] = - half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); - bf0[26] = - half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); - bf0[27] = bf1[27]; - bf0[28] = bf1[28]; - bf0[29] = - half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); - bf0[30] = - half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); - bf0[31] = bf1[31]; - - // stage 5 - bf1[0] = - half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); - bf1[1] = - half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); - bf1[2] = - half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); - bf1[3] = - half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); - addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); - bf1[8] = bf0[8]; - bf1[9] = - half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); - bf1[10] = - half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = - half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); - bf1[14] = - half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); - bf1[15] = bf0[15]; - addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); - - // stage 6 - addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); - bf0[4] = bf1[4]; - bf0[5] = - half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); - bf0[6] = - half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); - bf0[7] = bf1[7]; - addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); - bf0[16] = bf1[16]; - bf0[17] = bf1[17]; - bf0[18] = - half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); - bf0[19] = - half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); - bf0[20] = - half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); - bf0[21] = - half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); - bf0[22] = bf1[22]; - bf0[23] = bf1[23]; - bf0[24] = bf1[24]; - bf0[25] = bf1[25]; - bf0[26] = - half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); - bf0[27] = - half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); - bf0[28] = - half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); - bf0[29] = - half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); - bf0[30] = bf1[30]; - bf0[31] = bf1[31]; - - // stage 7 - addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = - half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); - bf1[11] = - half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); - bf1[12] = - half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); - bf1[13] = - half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); - - // stage 8 - addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); - addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); - bf0[16] = bf1[16]; - bf0[17] = bf1[17]; - bf0[18] = bf1[18]; - bf0[19] = bf1[19]; - bf0[20] = - half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); - bf0[21] = - half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); - bf0[22] = - half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); - bf0[23] = - half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); - bf0[24] = - half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); - bf0[25] = - half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); - bf0[26] = - half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); - bf0[27] = - half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); - bf0[28] = bf1[28]; - bf0[29] = bf1[29]; - bf0[30] = bf1[30]; - bf0[31] = bf1[31]; - - // stage 9 - if (do_cols) { - addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31); - addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30); - addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29); - addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28); - addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27); - addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26); - addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25); - addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24); - addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23); - addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22); - addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21); - addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20); - addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19); - addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18); - addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17); - addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); - } - } -} - -typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, - int do_cols, int bd, int out_shift); - -static const transform_1d_avx2 - highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { - { - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - }, - { { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, - { - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - }, - { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, - - { { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } } - }; - -static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, - uint16_t *output, int stride, - TX_TYPE tx_type, - TX_SIZE tx_size, int eob, - const int bd) { - __m256i buf1[64 * 2]; - int eobx, eoby; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div8 = txfm_size_col >> 3; - const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - const int input_stride = AOMMIN(32, txfm_size_col); - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_avx2 row_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_avx2 col_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // 1st stage: column transform - for (int i = 0; i < buf_size_nonzero_h_div8; i++) { - __m256i buf0[32]; - const int32_t *input_row = input + i * input_stride * 8; - for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { - __m256i *buf0_cur = buf0 + j * 8; - load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8); - - transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]); - } - - row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); - - __m256i *_buf1 = buf1 + i * 8; - for (int j = 0; j < buf_size_w_div8; ++j) { - transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); - } - } - // 2nd stage: column transform - for (int i = 0; i < buf_size_w_div8; i++) { - col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, - inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - - av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row, - buf1 + i * txfm_size_row, txfm_size_row, - -shift[1]); - } - - // write to buffer - { - for (int i = 0; i < (txfm_size_col >> 4); i++) { - highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, - output + 16 * i, stride, ud_flip, - txfm_size_row, bd); - } - } -} - -void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, - uint8_t *output, int stride, - TX_TYPE tx_type, TX_SIZE tx_size, - int eob, const int bd) { - switch (tx_type) { - case DCT_DCT: - highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), - stride, tx_type, tx_size, eob, bd); - break; - default: assert(0); break; - } -} - -void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - const int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. - case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); - const TX_SIZE tx_size = txfm_param->tx_size; - switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_8X8: - av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); - break; - case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); - break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); - break; - case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); - break; - case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); - break; - case TX_4X4: - av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); - break; - case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); - break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); - break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); - break; - case TX_64X64: - case TX_16X64: - case TX_64X16: - av1_highbd_inv_txfm2d_add_universe_sse4_1( - input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, - txfm_param->eob, txfm_param->bd); - break; - default: assert(0 && "Invalid transform size"); break; - } -} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c deleted file mode 100644 index e29e0baf5..000000000 --- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c +++ /dev/null @@ -1,5348 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include <assert.h> -#include <smmintrin.h> /* SSE4.1 */ - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "av1/common/av1_inv_txfm1d_cfg.h" -#include "av1/common/idct.h" -#include "av1/common/x86/av1_inv_txfm_ssse3.h" -#include "av1/common/x86/av1_txfm_sse4.h" -#include "av1/common/x86/highbd_txfm_utility_sse4.h" - -static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - __m128i clamped, mask; - - mask = _mm_cmpgt_epi16(u, max); - clamped = _mm_andnot_si128(mask, u); - mask = _mm_and_si128(mask, max); - clamped = _mm_or_si128(mask, clamped); - mask = _mm_cmpgt_epi16(clamped, zero); - clamped = _mm_and_si128(clamped, mask); - - return clamped; -} - -static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, - __m128i res0, __m128i res1, - const int bd) { - __m128i x0 = _mm_cvtepi16_epi32(pred); - __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); - - x0 = _mm_add_epi32(res0, x0); - x1 = _mm_add_epi32(res1, x1); - x0 = _mm_packus_epi32(x0, x1); - x0 = highbd_clamp_epi16(x0, bd); - return x0; -} - -static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, - int stride, int flipud, - int height, const int bd) { - int j = flipud ? (height - 1) : 0; - const int step = flipud ? -1 : 1; - for (int i = 0; i < height; ++i, j += step) { - __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); - __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); - - _mm_storeu_si128((__m128i *)(output + i * stride), u); - } -} - -static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, - __m128i *out, int out_size) { - for (int i = 0; i < out_size; ++i) { - out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); - } -} - -static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { - in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); - in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); - in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); - in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); -} - -static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, - __m128i *out1, const __m128i *clamp_lo, - const __m128i *clamp_hi) { - __m128i a0 = _mm_add_epi32(in0, in1); - __m128i a1 = _mm_sub_epi32(in0, in1); - - a0 = _mm_max_epi32(a0, *clamp_lo); - a0 = _mm_min_epi32(a0, *clamp_hi); - a1 = _mm_max_epi32(a1, *clamp_lo); - a1 = _mm_min_epi32(a1, *clamp_hi); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1, - __m128i *out0, __m128i *out1) { - __m128i a0 = _mm_add_epi32(in0, in1); - __m128i a1 = _mm_sub_epi32(in0, in1); - - *out0 = a0; - *out1 = a1; -} - -static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1, - __m128i *out0, __m128i *out1, - const __m128i *clamp_lo, - const __m128i *clamp_hi, int shift) { - __m128i offset = _mm_set1_epi32((1 << shift) >> 1); - __m128i in0_w_offset = _mm_add_epi32(in0, offset); - __m128i a0 = _mm_add_epi32(in0_w_offset, in1); - __m128i a1 = _mm_sub_epi32(in0_w_offset, in1); - - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - - a0 = _mm_max_epi32(a0, *clamp_lo); - a0 = _mm_min_epi32(a0, *clamp_hi); - a1 = _mm_max_epi32(a1, *clamp_lo); - a1 = _mm_min_epi32(a1, *clamp_hi); - - *out0 = a0; - *out1 = a1; -} - -static INLINE void idct32_stage4_sse4_1( - __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, - const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, - const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, - const __m128i *rounding, int bit) { - __m128i temp1, temp2; - temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); - bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); - bf1[17] = temp1; - - temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); - bf1[29] = - half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); - bf1[18] = temp2; - - temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); - bf1[26] = - half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); - bf1[21] = temp1; - - temp2 = - half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); - bf1[25] = - half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); - bf1[22] = temp2; -} - -static INLINE void idct32_stage5_sse4_1( - __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, - const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, - const __m128i *clamp_hi, const __m128i *rounding, int bit) { - __m128i temp1, temp2; - temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); - bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); - bf1[9] = temp1; - - temp2 = - half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); - bf1[13] = - half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); - bf1[10] = temp2; - - addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); -} - -static INLINE void idct32_stage6_sse4_1( - __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, - const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, - const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, - const __m128i *rounding, int bit) { - __m128i temp1, temp2; - temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); - bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); - bf1[5] = temp1; - - addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); - - temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); - bf1[29] = - half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); - bf1[18] = temp1; - temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); - bf1[28] = - half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); - bf1[19] = temp2; - temp1 = - half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); - bf1[27] = - half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); - bf1[20] = temp1; - temp2 = - half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); - bf1[26] = - half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); - bf1[21] = temp2; -} - -static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, - const __m128i *cospi32, - const __m128i *clamp_lo, - const __m128i *clamp_hi, - const __m128i *rounding, int bit) { - __m128i temp1, temp2; - addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); - - temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); - bf1[13] = - half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); - bf1[10] = temp1; - temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); - bf1[12] = - half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); - bf1[11] = temp2; - - addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); -} - -static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, - const __m128i *cospi32, - const __m128i *clamp_lo, - const __m128i *clamp_hi, - const __m128i *rounding, int bit) { - __m128i temp1, temp2; - addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); - addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); - - temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); - bf1[27] = - half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); - bf1[20] = temp1; - temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); - bf1[26] = - half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); - bf1[21] = temp2; - temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); - bf1[25] = - half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); - bf1[22] = temp1; - temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); - bf1[24] = - half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); - bf1[23] = temp2; -} - -static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, - const int do_cols, const int bd, - const int out_shift, - const int log_range) { - if (do_cols) { - addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31); - addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30); - addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29); - addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28); - addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27); - addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26); - addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25); - addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24); - addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23); - addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22); - addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21); - addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20); - addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19); - addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18); - addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17); - addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); - } -} - -static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, - __m128i *out0, __m128i *out1, - const __m128i *clamp_lo, const __m128i *clamp_hi, - int shift) { - __m128i offset = _mm_set1_epi32((1 << shift) >> 1); - __m128i a0 = _mm_add_epi32(offset, in0); - __m128i a1 = _mm_sub_epi32(offset, in1); - - a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); - a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); - - a0 = _mm_max_epi32(a0, *clamp_lo); - a0 = _mm_min_epi32(a0, *clamp_hi); - a1 = _mm_max_epi32(a1, *clamp_lo); - a1 = _mm_min_epi32(a1, *clamp_hi); - - *out0 = a0; - *out1 = a1; -} - -static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - - __m128i u0, u1, u2, u3; - __m128i v0, v1, v2, v3, x, y; - - v0 = _mm_unpacklo_epi32(in[0], in[1]); - v1 = _mm_unpackhi_epi32(in[0], in[1]); - v2 = _mm_unpacklo_epi32(in[2], in[3]); - v3 = _mm_unpackhi_epi32(in[2], in[3]); - - u0 = _mm_unpacklo_epi64(v0, v2); - u1 = _mm_unpackhi_epi64(v0, v2); - u2 = _mm_unpacklo_epi64(v1, v3); - u3 = _mm_unpackhi_epi64(v1, v3); - - x = _mm_mullo_epi32(u0, cospi32); - y = _mm_mullo_epi32(u2, cospi32); - v0 = _mm_add_epi32(x, y); - v0 = _mm_add_epi32(v0, rnding); - v0 = _mm_srai_epi32(v0, bit); - - v1 = _mm_sub_epi32(x, y); - v1 = _mm_add_epi32(v1, rnding); - v1 = _mm_srai_epi32(v1, bit); - - x = _mm_mullo_epi32(u1, cospi48); - y = _mm_mullo_epi32(u3, cospim16); - v2 = _mm_add_epi32(x, y); - v2 = _mm_add_epi32(v2, rnding); - v2 = _mm_srai_epi32(v2, bit); - - x = _mm_mullo_epi32(u1, cospi16); - y = _mm_mullo_epi32(u3, cospi48); - v3 = _mm_add_epi32(x, y); - v3 = _mm_add_epi32(v3, rnding); - v3 = _mm_srai_epi32(v3, bit); - - if (do_cols) { - addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3); - addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2); - } else { - const int log_range = AOMMAX(16, bd + 6); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi); - addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi); - } -} - -static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) { - const int32_t *sinpi = sinpi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); - const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); - const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); - const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); - __m128i t; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i x0, x1, x2, x3; - __m128i u0, u1, u2, u3; - __m128i v0, v1, v2, v3; - - v0 = _mm_unpacklo_epi32(in[0], in[1]); - v1 = _mm_unpackhi_epi32(in[0], in[1]); - v2 = _mm_unpacklo_epi32(in[2], in[3]); - v3 = _mm_unpackhi_epi32(in[2], in[3]); - - x0 = _mm_unpacklo_epi64(v0, v2); - x1 = _mm_unpackhi_epi64(v0, v2); - x2 = _mm_unpacklo_epi64(v1, v3); - x3 = _mm_unpackhi_epi64(v1, v3); - - s0 = _mm_mullo_epi32(x0, sinpi1); - s1 = _mm_mullo_epi32(x0, sinpi2); - s2 = _mm_mullo_epi32(x1, sinpi3); - s3 = _mm_mullo_epi32(x2, sinpi4); - s4 = _mm_mullo_epi32(x2, sinpi1); - s5 = _mm_mullo_epi32(x3, sinpi2); - s6 = _mm_mullo_epi32(x3, sinpi4); - t = _mm_sub_epi32(x0, x2); - s7 = _mm_add_epi32(t, x3); - - t = _mm_add_epi32(s0, s3); - s0 = _mm_add_epi32(t, s5); - t = _mm_sub_epi32(s1, s4); - s1 = _mm_sub_epi32(t, s6); - s3 = s2; - s2 = _mm_mullo_epi32(s7, sinpi3); - - u0 = _mm_add_epi32(s0, s3); - u1 = _mm_add_epi32(s1, s3); - u2 = s2; - t = _mm_add_epi32(s0, s1); - u3 = _mm_sub_epi32(t, s3); - - u0 = _mm_add_epi32(u0, rnding); - u0 = _mm_srai_epi32(u0, bit); - - u1 = _mm_add_epi32(u1, rnding); - u1 = _mm_srai_epi32(u1, bit); - - u2 = _mm_add_epi32(u2, rnding); - u2 = _mm_srai_epi32(u2, bit); - - u3 = _mm_add_epi32(u3, rnding); - u3 = _mm_srai_epi32(u3, bit); - - if (!do_cols) { - const int log_range = AOMMAX(16, bd + 6); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - u0 = _mm_max_epi32(u0, clamp_lo); - u0 = _mm_min_epi32(u0, clamp_hi); - u1 = _mm_max_epi32(u1, clamp_lo); - u1 = _mm_min_epi32(u1, clamp_hi); - u2 = _mm_max_epi32(u2, clamp_lo); - u2 = _mm_min_epi32(u2, clamp_hi); - u3 = _mm_max_epi32(u3, clamp_lo); - u3 = _mm_min_epi32(u3, clamp_hi); - } - - in[0] = u0; - in[1] = u1; - in[2] = u2; - in[3] = u3; -} - -static INLINE void round_shift_4x4(__m128i *in, int shift) { - __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); - - in[0] = _mm_add_epi32(in[0], rnding); - in[1] = _mm_add_epi32(in[1], rnding); - in[2] = _mm_add_epi32(in[2], rnding); - in[3] = _mm_add_epi32(in[3], rnding); - - in[0] = _mm_srai_epi32(in[0], shift); - in[1] = _mm_srai_epi32(in[1], shift); - in[2] = _mm_srai_epi32(in[2], shift); - in[3] = _mm_srai_epi32(in[3], shift); -} - -static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - const __m128i zero = _mm_setzero_si128(); - __m128i u0, u1, u2, u3; - __m128i v0, v1, v2, v3; - - round_shift_4x4(in, shift); - - v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride)); - v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); - v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); - v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); - - v0 = _mm_unpacklo_epi16(v0, zero); - v1 = _mm_unpacklo_epi16(v1, zero); - v2 = _mm_unpacklo_epi16(v2, zero); - v3 = _mm_unpacklo_epi16(v3, zero); - - if (fliplr) { - in[0] = _mm_shuffle_epi32(in[0], 0x1B); - in[1] = _mm_shuffle_epi32(in[1], 0x1B); - in[2] = _mm_shuffle_epi32(in[2], 0x1B); - in[3] = _mm_shuffle_epi32(in[3], 0x1B); - } - - if (flipud) { - u0 = _mm_add_epi32(in[3], v0); - u1 = _mm_add_epi32(in[2], v1); - u2 = _mm_add_epi32(in[1], v2); - u3 = _mm_add_epi32(in[0], v3); - } else { - u0 = _mm_add_epi32(in[0], v0); - u1 = _mm_add_epi32(in[1], v1); - u2 = _mm_add_epi32(in[2], v2); - u3 = _mm_add_epi32(in[3], v3); - } - - v0 = _mm_packus_epi32(u0, u1); - v2 = _mm_packus_epi32(u2, u3); - - u0 = highbd_clamp_epi16(v0, bd); - u2 = highbd_clamp_epi16(v2, bd); - - v0 = _mm_unpacklo_epi64(u0, u0); - v1 = _mm_unpackhi_epi64(u0, u0); - v2 = _mm_unpacklo_epi64(u2, u2); - v3 = _mm_unpackhi_epi64(u2, u2); - - _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); - _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); - _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); - _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); -} - -void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m128i in[4]; - const int8_t *shift = inv_txfm_shift_ls[TX_4X4]; - const int txw_idx = get_txw_idx(TX_4X4); - const int txh_idx = get_txh_idx(TX_4X4); - - switch (tx_type) { - case DCT_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); - break; - case DCT_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); - break; - case FLIPADST_DCT: - load_buffer_4x4(coeff, in); - idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); - break; - case DCT_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); - break; - case FLIPADST_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); - break; - case ADST_FLIPADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); - break; - case FLIPADST_ADST: - load_buffer_4x4(coeff, in); - iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd); - iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd); - write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); - break; - default: assert(0); - } -} - -// 8x8 -static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { - in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); - in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); - in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); - in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); - in[4] = _mm_load_si128((const __m128i *)(coeff + 16)); - in[5] = _mm_load_si128((const __m128i *)(coeff + 20)); - in[6] = _mm_load_si128((const __m128i *)(coeff + 24)); - in[7] = _mm_load_si128((const __m128i *)(coeff + 28)); - in[8] = _mm_load_si128((const __m128i *)(coeff + 32)); - in[9] = _mm_load_si128((const __m128i *)(coeff + 36)); - in[10] = _mm_load_si128((const __m128i *)(coeff + 40)); - in[11] = _mm_load_si128((const __m128i *)(coeff + 44)); - in[12] = _mm_load_si128((const __m128i *)(coeff + 48)); - in[13] = _mm_load_si128((const __m128i *)(coeff + 52)); - in[14] = _mm_load_si128((const __m128i *)(coeff + 56)); - in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); -} - -static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i x, y; - int col; - - // Note: - // Even column: 0, 2, ..., 14 - // Odd column: 1, 3, ..., 15 - // one even column plus one odd column constructs one row (8 coeffs) - // total we have 8 rows (8x8). - for (col = 0; col < 2; ++col) { - // stage 0 - // stage 1 - // stage 2 - u0 = in[0 * 2 + col]; - u1 = in[4 * 2 + col]; - u2 = in[2 * 2 + col]; - u3 = in[6 * 2 + col]; - - x = _mm_mullo_epi32(in[1 * 2 + col], cospi56); - y = _mm_mullo_epi32(in[7 * 2 + col], cospim8); - u4 = _mm_add_epi32(x, y); - u4 = _mm_add_epi32(u4, rnding); - u4 = _mm_srai_epi32(u4, bit); - - x = _mm_mullo_epi32(in[1 * 2 + col], cospi8); - y = _mm_mullo_epi32(in[7 * 2 + col], cospi56); - u7 = _mm_add_epi32(x, y); - u7 = _mm_add_epi32(u7, rnding); - u7 = _mm_srai_epi32(u7, bit); - - x = _mm_mullo_epi32(in[5 * 2 + col], cospi24); - y = _mm_mullo_epi32(in[3 * 2 + col], cospim40); - u5 = _mm_add_epi32(x, y); - u5 = _mm_add_epi32(u5, rnding); - u5 = _mm_srai_epi32(u5, bit); - - x = _mm_mullo_epi32(in[5 * 2 + col], cospi40); - y = _mm_mullo_epi32(in[3 * 2 + col], cospi24); - u6 = _mm_add_epi32(x, y); - u6 = _mm_add_epi32(u6, rnding); - u6 = _mm_srai_epi32(u6, bit); - - // stage 3 - x = _mm_mullo_epi32(u0, cospi32); - y = _mm_mullo_epi32(u1, cospi32); - v0 = _mm_add_epi32(x, y); - v0 = _mm_add_epi32(v0, rnding); - v0 = _mm_srai_epi32(v0, bit); - - v1 = _mm_sub_epi32(x, y); - v1 = _mm_add_epi32(v1, rnding); - v1 = _mm_srai_epi32(v1, bit); - - x = _mm_mullo_epi32(u2, cospi48); - y = _mm_mullo_epi32(u3, cospim16); - v2 = _mm_add_epi32(x, y); - v2 = _mm_add_epi32(v2, rnding); - v2 = _mm_srai_epi32(v2, bit); - - x = _mm_mullo_epi32(u2, cospi16); - y = _mm_mullo_epi32(u3, cospi48); - v3 = _mm_add_epi32(x, y); - v3 = _mm_add_epi32(v3, rnding); - v3 = _mm_srai_epi32(v3, bit); - - addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); - addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); - - // stage 4 - addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); - addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); - u4 = v4; - u7 = v7; - - x = _mm_mullo_epi32(v5, cospi32); - y = _mm_mullo_epi32(v6, cospi32); - u6 = _mm_add_epi32(y, x); - u6 = _mm_add_epi32(u6, rnding); - u6 = _mm_srai_epi32(u6, bit); - - u5 = _mm_sub_epi32(y, x); - u5 = _mm_add_epi32(u5, rnding); - u5 = _mm_srai_epi32(u5, bit); - - // stage 5 - if (do_cols) { - addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col); - addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col); - addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col); - addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, - &clamp_lo_out, &clamp_hi_out, out_shift); - } - } -} - -static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[8], v[8], x; - - // Even 8 points: 0, 2, ..., 14 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[14], cospi4); - x = _mm_mullo_epi32(in[0], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - u[1] = _mm_mullo_epi32(in[14], cospi60); - x = _mm_mullo_epi32(in[0], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - // (2) - u[2] = _mm_mullo_epi32(in[10], cospi20); - x = _mm_mullo_epi32(in[4], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_mullo_epi32(in[10], cospi44); - x = _mm_mullo_epi32(in[4], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - // (3) - u[4] = _mm_mullo_epi32(in[6], cospi36); - x = _mm_mullo_epi32(in[8], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[6], cospi28); - x = _mm_mullo_epi32(in[8], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[2], cospi52); - x = _mm_mullo_epi32(in[12], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[2], cospi12); - x = _mm_mullo_epi32(in[12], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 3 - addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); - - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x = _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 5 - addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - if (do_cols) { - out[0] = u[0]; - out[2] = _mm_sub_epi32(kZero, u[4]); - out[4] = u[6]; - out[6] = _mm_sub_epi32(kZero, u[2]); - out[8] = u[3]; - out[10] = _mm_sub_epi32(kZero, u[7]); - out[12] = u[5]; - out[14] = _mm_sub_epi32(kZero, u[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - - neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, - &clamp_hi_out, out_shift); - } - - // Odd 8 points: 1, 3, ..., 15 - // stage 0 - // stage 1 - // stage 2 - // (1) - u[0] = _mm_mullo_epi32(in[15], cospi4); - x = _mm_mullo_epi32(in[1], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - u[1] = _mm_mullo_epi32(in[15], cospi60); - x = _mm_mullo_epi32(in[1], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - // (2) - u[2] = _mm_mullo_epi32(in[11], cospi20); - x = _mm_mullo_epi32(in[5], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_mullo_epi32(in[11], cospi44); - x = _mm_mullo_epi32(in[5], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - // (3) - u[4] = _mm_mullo_epi32(in[7], cospi36); - x = _mm_mullo_epi32(in[9], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[7], cospi28); - x = _mm_mullo_epi32(in[9], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[3], cospi52); - x = _mm_mullo_epi32(in[13], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[3], cospi12); - x = _mm_mullo_epi32(in[13], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 3 - addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); - - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x = _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 5 - addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - if (do_cols) { - out[1] = u[0]; - out[3] = _mm_sub_epi32(kZero, u[4]); - out[5] = u[6]; - out[7] = _mm_sub_epi32(kZero, u[2]); - out[9] = u[3]; - out[11] = _mm_sub_epi32(kZero, u[7]); - out[13] = u[5]; - out[15] = _mm_sub_epi32(kZero, u[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - - neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } -} - -static void round_shift_8x8(__m128i *in, int shift) { - round_shift_4x4(&in[0], shift); - round_shift_4x4(&in[4], shift); - round_shift_4x4(&in[8], shift); - round_shift_4x4(&in[12], shift); -} - -static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, - int fliplr, int bd) { - __m128i x0, x1; - const __m128i zero = _mm_setzero_si128(); - - x0 = _mm_unpacklo_epi16(pred, zero); - x1 = _mm_unpackhi_epi16(pred, zero); - - if (fliplr) { - res_lo = _mm_shuffle_epi32(res_lo, 0x1B); - res_hi = _mm_shuffle_epi32(res_hi, 0x1B); - x0 = _mm_add_epi32(res_hi, x0); - x1 = _mm_add_epi32(res_lo, x1); - - } else { - x0 = _mm_add_epi32(res_lo, x0); - x1 = _mm_add_epi32(res_hi, x1); - } - - x0 = _mm_packus_epi32(x0, x1); - return highbd_clamp_epi16(x0, bd); -} - -static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, - int fliplr, int flipud, int shift, int bd) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - - round_shift_8x8(in, shift); - - v0 = _mm_load_si128((__m128i const *)(output + 0 * stride)); - v1 = _mm_load_si128((__m128i const *)(output + 1 * stride)); - v2 = _mm_load_si128((__m128i const *)(output + 2 * stride)); - v3 = _mm_load_si128((__m128i const *)(output + 3 * stride)); - v4 = _mm_load_si128((__m128i const *)(output + 4 * stride)); - v5 = _mm_load_si128((__m128i const *)(output + 5 * stride)); - v6 = _mm_load_si128((__m128i const *)(output + 6 * stride)); - v7 = _mm_load_si128((__m128i const *)(output + 7 * stride)); - - if (flipud) { - u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); - u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); - u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); - u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); - u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); - u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); - u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); - u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); - } else { - u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); - u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); - u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); - u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); - u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); - u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); - u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); - u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); - } - - _mm_store_si128((__m128i *)(output + 0 * stride), u0); - _mm_store_si128((__m128i *)(output + 1 * stride), u1); - _mm_store_si128((__m128i *)(output + 2 * stride), u2); - _mm_store_si128((__m128i *)(output + 3 * stride), u3); - _mm_store_si128((__m128i *)(output + 4 * stride), u4); - _mm_store_si128((__m128i *)(output + 5 * stride), u5); - _mm_store_si128((__m128i *)(output + 6 * stride), u6); - _mm_store_si128((__m128i *)(output + 7 * stride), u7); -} - -void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output, - int stride, TX_TYPE tx_type, int bd) { - __m128i in[16], out[16]; - const int8_t *shift = inv_txfm_shift_ls[TX_8X8]; - const int txw_idx = get_txw_idx(TX_8X8); - const int txh_idx = get_txh_idx(TX_8X8); - - switch (tx_type) { - case DCT_DCT: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); - break; - case DCT_ADST: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_DCT: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); - break; - case ADST_ADST: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); - break; - case FLIPADST_DCT: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); - break; - case DCT_FLIPADST: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); - break; - case ADST_FLIPADST: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); - break; - case FLIPADST_FLIPADST: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd); - break; - case FLIPADST_ADST: - load_buffer_8x8(coeff, in); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, - -shift[0]); - transpose_8x8(in, out); - iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); - break; - default: assert(0); - } -} - -static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - __m128i x; - - // stage 0 - // stage 1 - // stage 2 - // stage 3 - x = _mm_mullo_epi32(in[0], cospi32); - x = _mm_add_epi32(x, rnding); - x = _mm_srai_epi32(x, bit); - - // stage 4 - // stage 5 - if (!do_cols) { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - x = _mm_add_epi32(x, offset); - x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - x = _mm_max_epi32(x, clamp_lo_out); - x = _mm_min_epi32(x, clamp_hi_out); - } - - out[0] = x; - out[1] = x; - out[2] = x; - out[3] = x; - out[4] = x; - out[5] = x; - out[6] = x; - out[7] = x; -} - -static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i x, y; - - // stage 0 - // stage 1 - // stage 2 - u0 = in[0]; - u1 = in[4]; - u2 = in[2]; - u3 = in[6]; - - x = _mm_mullo_epi32(in[1], cospi56); - y = _mm_mullo_epi32(in[7], cospim8); - u4 = _mm_add_epi32(x, y); - u4 = _mm_add_epi32(u4, rnding); - u4 = _mm_srai_epi32(u4, bit); - - x = _mm_mullo_epi32(in[1], cospi8); - y = _mm_mullo_epi32(in[7], cospi56); - u7 = _mm_add_epi32(x, y); - u7 = _mm_add_epi32(u7, rnding); - u7 = _mm_srai_epi32(u7, bit); - - x = _mm_mullo_epi32(in[5], cospi24); - y = _mm_mullo_epi32(in[3], cospim40); - u5 = _mm_add_epi32(x, y); - u5 = _mm_add_epi32(u5, rnding); - u5 = _mm_srai_epi32(u5, bit); - - x = _mm_mullo_epi32(in[5], cospi40); - y = _mm_mullo_epi32(in[3], cospi24); - u6 = _mm_add_epi32(x, y); - u6 = _mm_add_epi32(u6, rnding); - u6 = _mm_srai_epi32(u6, bit); - - // stage 3 - x = _mm_mullo_epi32(u0, cospi32); - y = _mm_mullo_epi32(u1, cospi32); - v0 = _mm_add_epi32(x, y); - v0 = _mm_add_epi32(v0, rnding); - v0 = _mm_srai_epi32(v0, bit); - - v1 = _mm_sub_epi32(x, y); - v1 = _mm_add_epi32(v1, rnding); - v1 = _mm_srai_epi32(v1, bit); - - x = _mm_mullo_epi32(u2, cospi48); - y = _mm_mullo_epi32(u3, cospim16); - v2 = _mm_add_epi32(x, y); - v2 = _mm_add_epi32(v2, rnding); - v2 = _mm_srai_epi32(v2, bit); - - x = _mm_mullo_epi32(u2, cospi16); - y = _mm_mullo_epi32(u3, cospi48); - v3 = _mm_add_epi32(x, y); - v3 = _mm_add_epi32(v3, rnding); - v3 = _mm_srai_epi32(v3, bit); - - addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); - addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); - - // stage 4 - addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); - addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); - u4 = v4; - u7 = v7; - - x = _mm_mullo_epi32(v5, cospi32); - y = _mm_mullo_epi32(v6, cospi32); - u6 = _mm_add_epi32(y, x); - u6 = _mm_add_epi32(u6, rnding); - u6 = _mm_srai_epi32(u6, bit); - - u5 = _mm_sub_epi32(y, x); - u5 = _mm_add_epi32(u5, rnding); - u5 = _mm_srai_epi32(u5, bit); - - // stage 5 - if (do_cols) { - addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7); - addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6); - addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5); - addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out, - out_shift); - addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out, - out_shift); - } -} - -static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - __m128i u[8], x; - - // stage 0 - // stage 1 - // stage 2 - - x = _mm_mullo_epi32(in[0], cospi60); - u[0] = _mm_add_epi32(x, rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - x = _mm_mullo_epi32(in[0], cospi4); - u[1] = _mm_sub_epi32(kZero, x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - // stage 3 - // stage 4 - __m128i temp1, temp2; - temp1 = _mm_mullo_epi32(u[0], cospi16); - x = _mm_mullo_epi32(u[1], cospi48); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - u[4] = temp1; - - temp2 = _mm_mullo_epi32(u[0], cospi48); - x = _mm_mullo_epi32(u[1], cospi16); - u[5] = _mm_sub_epi32(temp2, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // stage 5 - // stage 6 - temp1 = _mm_mullo_epi32(u[0], cospi32); - x = _mm_mullo_epi32(u[1], cospi32); - u[2] = _mm_add_epi32(temp1, x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_sub_epi32(temp1, x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - temp1 = _mm_mullo_epi32(u[4], cospi32); - x = _mm_mullo_epi32(u[5], cospi32); - u[6] = _mm_add_epi32(temp1, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(temp1, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - if (do_cols) { - out[0] = u[0]; - out[1] = _mm_sub_epi32(kZero, u[4]); - out[2] = u[6]; - out[3] = _mm_sub_epi32(kZero, u[2]); - out[4] = u[3]; - out[5] = _mm_sub_epi32(kZero, u[7]); - out[6] = u[5]; - out[7] = _mm_sub_epi32(kZero, u[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - - neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, - out_shift); - } -} - -static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i kZero = _mm_setzero_si128(); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[8], v[8], x; - - // stage 0 - // stage 1 - // stage 2 - - u[0] = _mm_mullo_epi32(in[7], cospi4); - x = _mm_mullo_epi32(in[0], cospi60); - u[0] = _mm_add_epi32(u[0], x); - u[0] = _mm_add_epi32(u[0], rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - u[1] = _mm_mullo_epi32(in[7], cospi60); - x = _mm_mullo_epi32(in[0], cospi4); - u[1] = _mm_sub_epi32(u[1], x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - // (2) - u[2] = _mm_mullo_epi32(in[5], cospi20); - x = _mm_mullo_epi32(in[2], cospi44); - u[2] = _mm_add_epi32(u[2], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_mullo_epi32(in[5], cospi44); - x = _mm_mullo_epi32(in[2], cospi20); - u[3] = _mm_sub_epi32(u[3], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - // (3) - u[4] = _mm_mullo_epi32(in[3], cospi36); - x = _mm_mullo_epi32(in[4], cospi28); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(in[3], cospi28); - x = _mm_mullo_epi32(in[4], cospi36); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - // (4) - u[6] = _mm_mullo_epi32(in[1], cospi52); - x = _mm_mullo_epi32(in[6], cospi12); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(in[1], cospi12); - x = _mm_mullo_epi32(in[6], cospi52); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 3 - addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); - - // stage 4 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - - u[4] = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - u[5] = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - u[5] = _mm_sub_epi32(u[5], x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_mullo_epi32(v[6], cospim48); - x = _mm_mullo_epi32(v[7], cospi16); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_mullo_epi32(v[6], cospi16); - x = _mm_mullo_epi32(v[7], cospim48); - u[7] = _mm_sub_epi32(u[7], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 5 - addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); - - // stage 6 - u[0] = v[0]; - u[1] = v[1]; - u[4] = v[4]; - u[5] = v[5]; - - v[0] = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - u[2] = _mm_add_epi32(v[0], x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_sub_epi32(v[0], x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - v[0] = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - u[6] = _mm_add_epi32(v[0], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(v[0], x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - // stage 7 - if (do_cols) { - out[0] = u[0]; - out[1] = _mm_sub_epi32(kZero, u[4]); - out[2] = u[6]; - out[3] = _mm_sub_epi32(kZero, u[2]); - out[4] = u[3]; - out[5] = _mm_sub_epi32(kZero, u[7]); - out[6] = u[5]; - out[7] = _mm_sub_epi32(kZero, u[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - - neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, - out_shift); - neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, - out_shift); - } -} - -static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - { - // stage 0 - // stage 1 - // stage 2 - // stage 3 - // stage 4 - in[0] = _mm_mullo_epi32(in[0], cospi32); - in[0] = _mm_add_epi32(in[0], rnding); - in[0] = _mm_srai_epi32(in[0], bit); - - // stage 5 - // stage 6 - // stage 7 - if (do_cols) { - in[0] = _mm_max_epi32(in[0], clamp_lo); - in[0] = _mm_min_epi32(in[0], clamp_hi); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - in[0] = _mm_add_epi32(in[0], offset); - in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); - in[0] = _mm_max_epi32(in[0], clamp_lo_out); - in[0] = _mm_min_epi32(in[0], clamp_hi_out); - } - - out[0] = in[0]; - out[1] = in[0]; - out[2] = in[0]; - out[3] = in[0]; - out[4] = in[0]; - out[5] = in[0]; - out[6] = in[0]; - out[7] = in[0]; - out[8] = in[0]; - out[9] = in[0]; - out[10] = in[0]; - out[11] = in[0]; - out[12] = in[0]; - out[13] = in[0]; - out[14] = in[0]; - out[15] = in[0]; - } -} - -static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], x, y; - - { - // stage 0 - // stage 1 - u[0] = in[0]; - u[2] = in[4]; - u[4] = in[2]; - u[6] = in[6]; - u[8] = in[1]; - u[10] = in[5]; - u[12] = in[3]; - u[14] = in[7]; - - // stage 2 - u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - - u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); - u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); - - u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); - u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); - - u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); - u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); - - // stage 3 - u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); - u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); - u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); - u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); - - addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); - - // stage 4 - x = _mm_mullo_epi32(u[0], cospi32); - u[0] = _mm_add_epi32(x, rnding); - u[0] = _mm_srai_epi32(u[0], bit); - u[1] = u[0]; - - u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); - u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); - - addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); - - x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - u[9] = x; - y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - u[10] = y; - - // stage 5 - addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); - - x = _mm_mullo_epi32(u[5], cospi32); - y = _mm_mullo_epi32(u[6], cospi32); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); - - // stage 6 - addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); - - x = _mm_mullo_epi32(u[10], cospi32); - y = _mm_mullo_epi32(u[13], cospi32); - u[10] = _mm_sub_epi32(y, x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); - - u[13] = _mm_add_epi32(x, y); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - x = _mm_mullo_epi32(u[11], cospi32); - y = _mm_mullo_epi32(u[12], cospi32); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); - - u[12] = _mm_add_epi32(x, y); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15); - addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14); - addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13); - addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12); - addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11); - addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10); - addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9); - addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out, - &clamp_hi_out, out_shift); - } - } -} - -static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const __m128i zero = _mm_setzero_si128(); - __m128i v[16], x, y, temp1, temp2; - - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - x = _mm_mullo_epi32(in[0], cospi62); - v[0] = _mm_add_epi32(x, rnding); - v[0] = _mm_srai_epi32(v[0], bit); - - x = _mm_mullo_epi32(in[0], cospi2); - v[1] = _mm_sub_epi32(zero, x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); - - // stage 3 - v[8] = v[0]; - v[9] = v[1]; - - // stage 4 - temp1 = _mm_mullo_epi32(v[8], cospi8); - x = _mm_mullo_epi32(v[9], cospi56); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[8], cospi56); - x = _mm_mullo_epi32(v[9], cospi8); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = _mm_srai_epi32(temp2, bit); - v[8] = temp1; - v[9] = temp2; - - // stage 5 - v[4] = v[0]; - v[5] = v[1]; - v[12] = v[8]; - v[13] = v[9]; - - // stage 6 - temp1 = _mm_mullo_epi32(v[4], cospi16); - x = _mm_mullo_epi32(v[5], cospi48); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[4], cospi48); - x = _mm_mullo_epi32(v[5], cospi16); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = _mm_srai_epi32(temp2, bit); - v[4] = temp1; - v[5] = temp2; - - temp1 = _mm_mullo_epi32(v[12], cospi16); - x = _mm_mullo_epi32(v[13], cospi48); - temp1 = _mm_add_epi32(temp1, x); - temp1 = _mm_add_epi32(temp1, rnding); - temp1 = _mm_srai_epi32(temp1, bit); - - temp2 = _mm_mullo_epi32(v[12], cospi48); - x = _mm_mullo_epi32(v[13], cospi16); - temp2 = _mm_sub_epi32(temp2, x); - temp2 = _mm_add_epi32(temp2, rnding); - temp2 = _mm_srai_epi32(temp2, bit); - v[12] = temp1; - v[13] = temp2; - - // stage 7 - v[2] = v[0]; - v[3] = v[1]; - v[6] = v[4]; - v[7] = v[5]; - v[10] = v[8]; - v[11] = v[9]; - v[14] = v[12]; - v[15] = v[13]; - - // stage 8 - y = _mm_mullo_epi32(v[2], cospi32); - x = _mm_mullo_epi32(v[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - - y = _mm_mullo_epi32(v[6], cospi32); - x = _mm_mullo_epi32(v[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - y = _mm_mullo_epi32(v[10], cospi32); - x = _mm_mullo_epi32(v[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - y = _mm_mullo_epi32(v[14], cospi32); - x = _mm_mullo_epi32(v[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 9 - if (do_cols) { - out[0] = v[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2] = v[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4] = v[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6] = v[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8] = v[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10] = v[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12] = v[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14] = v[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - - neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } - } -} - -static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi34 = _mm_set1_epi32(cospi[34]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi42 = _mm_set1_epi32(cospi[42]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi50 = _mm_set1_epi32(cospi[50]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi58 = _mm_set1_epi32(cospi[58]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], x, y; - - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - __m128i zero = _mm_setzero_si128(); - x = _mm_mullo_epi32(in[0], cospi62); - u[0] = _mm_add_epi32(x, rnding); - u[0] = _mm_srai_epi32(u[0], bit); - - x = _mm_mullo_epi32(in[0], cospi2); - u[1] = _mm_sub_epi32(zero, x); - u[1] = _mm_add_epi32(u[1], rnding); - u[1] = _mm_srai_epi32(u[1], bit); - - x = _mm_mullo_epi32(in[2], cospi54); - u[2] = _mm_add_epi32(x, rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - x = _mm_mullo_epi32(in[2], cospi10); - u[3] = _mm_sub_epi32(zero, x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - - x = _mm_mullo_epi32(in[4], cospi46); - u[4] = _mm_add_epi32(x, rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - x = _mm_mullo_epi32(in[4], cospi18); - u[5] = _mm_sub_epi32(zero, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - x = _mm_mullo_epi32(in[6], cospi38); - u[6] = _mm_add_epi32(x, rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - x = _mm_mullo_epi32(in[6], cospi26); - u[7] = _mm_sub_epi32(zero, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - u[8] = _mm_mullo_epi32(in[7], cospi34); - u[8] = _mm_add_epi32(u[8], rnding); - u[8] = _mm_srai_epi32(u[8], bit); - - u[9] = _mm_mullo_epi32(in[7], cospi30); - u[9] = _mm_add_epi32(u[9], rnding); - u[9] = _mm_srai_epi32(u[9], bit); - - u[10] = _mm_mullo_epi32(in[5], cospi42); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); - - u[11] = _mm_mullo_epi32(in[5], cospi22); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); - - u[12] = _mm_mullo_epi32(in[3], cospi50); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - - u[13] = _mm_mullo_epi32(in[3], cospi14); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - u[14] = _mm_mullo_epi32(in[1], cospi58); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - u[15] = _mm_mullo_epi32(in[1], cospi6); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); - - // stage 3 - addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - - // stage 4 - y = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi56); - u[8] = _mm_mullo_epi32(u[8], cospi8); - u[8] = _mm_add_epi32(u[8], x); - u[8] = _mm_add_epi32(u[8], rnding); - u[8] = _mm_srai_epi32(u[8], bit); - - x = _mm_mullo_epi32(u[9], cospi8); - u[9] = _mm_sub_epi32(y, x); - u[9] = _mm_add_epi32(u[9], rnding); - u[9] = _mm_srai_epi32(u[9], bit); - - x = _mm_mullo_epi32(u[11], cospi24); - y = _mm_mullo_epi32(u[10], cospi24); - u[10] = _mm_mullo_epi32(u[10], cospi40); - u[10] = _mm_add_epi32(u[10], x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); - - x = _mm_mullo_epi32(u[11], cospi40); - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); - - x = _mm_mullo_epi32(u[13], cospi8); - y = _mm_mullo_epi32(u[12], cospi8); - u[12] = _mm_mullo_epi32(u[12], cospim56); - u[12] = _mm_add_epi32(u[12], x); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - - x = _mm_mullo_epi32(u[13], cospim56); - u[13] = _mm_sub_epi32(y, x); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - x = _mm_mullo_epi32(u[15], cospi40); - y = _mm_mullo_epi32(u[14], cospi40); - u[14] = _mm_mullo_epi32(u[14], cospim24); - u[14] = _mm_add_epi32(u[14], x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - x = _mm_mullo_epi32(u[15], cospim24); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); - - // stage 5 - addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - - // stage 6 - x = _mm_mullo_epi32(u[5], cospi48); - y = _mm_mullo_epi32(u[4], cospi48); - u[4] = _mm_mullo_epi32(u[4], cospi16); - u[4] = _mm_add_epi32(u[4], x); - u[4] = _mm_add_epi32(u[4], rnding); - u[4] = _mm_srai_epi32(u[4], bit); - - x = _mm_mullo_epi32(u[5], cospi16); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - x = _mm_mullo_epi32(u[7], cospi16); - y = _mm_mullo_epi32(u[6], cospi16); - u[6] = _mm_mullo_epi32(u[6], cospim48); - u[6] = _mm_add_epi32(u[6], x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - x = _mm_mullo_epi32(u[7], cospim48); - u[7] = _mm_sub_epi32(y, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - x = _mm_mullo_epi32(u[13], cospi48); - y = _mm_mullo_epi32(u[12], cospi48); - u[12] = _mm_mullo_epi32(u[12], cospi16); - u[12] = _mm_add_epi32(u[12], x); - u[12] = _mm_add_epi32(u[12], rnding); - u[12] = _mm_srai_epi32(u[12], bit); - - x = _mm_mullo_epi32(u[13], cospi16); - u[13] = _mm_sub_epi32(y, x); - u[13] = _mm_add_epi32(u[13], rnding); - u[13] = _mm_srai_epi32(u[13], bit); - - x = _mm_mullo_epi32(u[15], cospi16); - y = _mm_mullo_epi32(u[14], cospi16); - u[14] = _mm_mullo_epi32(u[14], cospim48); - u[14] = _mm_add_epi32(u[14], x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - x = _mm_mullo_epi32(u[15], cospim48); - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); - - // stage 7 - addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - - // stage 8 - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - u[2] = _mm_add_epi32(y, x); - u[2] = _mm_add_epi32(u[2], rnding); - u[2] = _mm_srai_epi32(u[2], bit); - - u[3] = _mm_sub_epi32(y, x); - u[3] = _mm_add_epi32(u[3], rnding); - u[3] = _mm_srai_epi32(u[3], bit); - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = _mm_sub_epi32(y, x); - u[7] = _mm_add_epi32(u[7], rnding); - u[7] = _mm_srai_epi32(u[7], bit); - - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - u[10] = _mm_add_epi32(y, x); - u[10] = _mm_add_epi32(u[10], rnding); - u[10] = _mm_srai_epi32(u[10], bit); - - u[11] = _mm_sub_epi32(y, x); - u[11] = _mm_add_epi32(u[11], rnding); - u[11] = _mm_srai_epi32(u[11], bit); - - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - u[14] = _mm_add_epi32(y, x); - u[14] = _mm_add_epi32(u[14], rnding); - u[14] = _mm_srai_epi32(u[14], bit); - - u[15] = _mm_sub_epi32(y, x); - u[15] = _mm_add_epi32(u[15], rnding); - u[15] = _mm_srai_epi32(u[15], bit); - - // stage 9 - if (do_cols) { - out[0] = u[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]); - out[2] = u[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]); - out[4] = u[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]); - out[6] = u[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]); - out[8] = u[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]); - out[10] = u[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]); - out[12] = u[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]); - out[14] = u[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - - neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } - } -} - -static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], v[16], x, y; - - { - // stage 0 - // stage 1 - u[0] = in[0]; - u[1] = in[8]; - u[2] = in[4]; - u[3] = in[12]; - u[4] = in[2]; - u[5] = in[10]; - u[6] = in[6]; - u[7] = in[14]; - u[8] = in[1]; - u[9] = in[9]; - u[10] = in[5]; - u[11] = in[13]; - u[12] = in[3]; - u[13] = in[11]; - u[14] = in[7]; - u[15] = in[15]; - - // stage 2 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - - v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); - v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); - v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); - v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); - v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); - - // stage 3 - u[0] = v[0]; - u[1] = v[1]; - u[2] = v[2]; - u[3] = v[3]; - u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); - u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); - u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); - u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); - addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); - - // stage 4 - x = _mm_mullo_epi32(u[0], cospi32); - y = _mm_mullo_epi32(u[1], cospi32); - v[0] = _mm_add_epi32(x, y); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); - - v[1] = _mm_sub_epi32(x, y); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); - - v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); - v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); - addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); - v[8] = u[8]; - v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - v[11] = u[11]; - v[12] = u[12]; - v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - v[15] = u[15]; - - // stage 5 - addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); - u[4] = v[4]; - - x = _mm_mullo_epi32(v[5], cospi32); - y = _mm_mullo_epi32(v[6], cospi32); - u[5] = _mm_sub_epi32(y, x); - u[5] = _mm_add_epi32(u[5], rnding); - u[5] = _mm_srai_epi32(u[5], bit); - - u[6] = _mm_add_epi32(y, x); - u[6] = _mm_add_epi32(u[6], rnding); - u[6] = _mm_srai_epi32(u[6], bit); - - u[7] = v[7]; - addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); - - // stage 6 - addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); - v[8] = u[8]; - v[9] = u[9]; - - x = _mm_mullo_epi32(u[10], cospi32); - y = _mm_mullo_epi32(u[13], cospi32); - v[10] = _mm_sub_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[13] = _mm_add_epi32(x, y); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - x = _mm_mullo_epi32(u[11], cospi32); - y = _mm_mullo_epi32(u[12], cospi32); - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_add_epi32(x, y); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[14] = u[14]; - v[15] = u[15]; - - // stage 7 - if (do_cols) { - addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15); - addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14); - addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13); - addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12); - addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11); - addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10); - addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9); - addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out, - &clamp_hi_out, out_shift); - } - } -} - -static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi34 = _mm_set1_epi32(cospi[34]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi42 = _mm_set1_epi32(cospi[42]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi50 = _mm_set1_epi32(cospi[50]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi58 = _mm_set1_epi32(cospi[58]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i u[16], v[16], x, y; - - // Calculate the column 0, 1, 2, 3 - { - // stage 0 - // stage 1 - // stage 2 - v[0] = _mm_mullo_epi32(in[15], cospi2); - x = _mm_mullo_epi32(in[0], cospi62); - v[0] = _mm_add_epi32(v[0], x); - v[0] = _mm_add_epi32(v[0], rnding); - v[0] = _mm_srai_epi32(v[0], bit); - - v[1] = _mm_mullo_epi32(in[15], cospi62); - x = _mm_mullo_epi32(in[0], cospi2); - v[1] = _mm_sub_epi32(v[1], x); - v[1] = _mm_add_epi32(v[1], rnding); - v[1] = _mm_srai_epi32(v[1], bit); - - v[2] = _mm_mullo_epi32(in[13], cospi10); - x = _mm_mullo_epi32(in[2], cospi54); - v[2] = _mm_add_epi32(v[2], x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_mullo_epi32(in[13], cospi54); - x = _mm_mullo_epi32(in[2], cospi10); - v[3] = _mm_sub_epi32(v[3], x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - - v[4] = _mm_mullo_epi32(in[11], cospi18); - x = _mm_mullo_epi32(in[4], cospi46); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(in[11], cospi46); - x = _mm_mullo_epi32(in[4], cospi18); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(in[9], cospi26); - x = _mm_mullo_epi32(in[6], cospi38); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(in[9], cospi38); - x = _mm_mullo_epi32(in[6], cospi26); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = _mm_mullo_epi32(in[7], cospi34); - x = _mm_mullo_epi32(in[8], cospi30); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(in[7], cospi30); - x = _mm_mullo_epi32(in[8], cospi34); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(in[5], cospi42); - x = _mm_mullo_epi32(in[10], cospi22); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_mullo_epi32(in[5], cospi22); - x = _mm_mullo_epi32(in[10], cospi42); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(in[3], cospi50); - x = _mm_mullo_epi32(in[12], cospi14); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(in[3], cospi14); - x = _mm_mullo_epi32(in[12], cospi50); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(in[1], cospi58); - x = _mm_mullo_epi32(in[14], cospi6); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(in[1], cospi6); - x = _mm_mullo_epi32(in[14], cospi58); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 3 - addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); - - // stage 4 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - v[4] = u[4]; - v[5] = u[5]; - v[6] = u[6]; - v[7] = u[7]; - - v[8] = _mm_mullo_epi32(u[8], cospi8); - x = _mm_mullo_epi32(u[9], cospi56); - v[8] = _mm_add_epi32(v[8], x); - v[8] = _mm_add_epi32(v[8], rnding); - v[8] = _mm_srai_epi32(v[8], bit); - - v[9] = _mm_mullo_epi32(u[8], cospi56); - x = _mm_mullo_epi32(u[9], cospi8); - v[9] = _mm_sub_epi32(v[9], x); - v[9] = _mm_add_epi32(v[9], rnding); - v[9] = _mm_srai_epi32(v[9], bit); - - v[10] = _mm_mullo_epi32(u[10], cospi40); - x = _mm_mullo_epi32(u[11], cospi24); - v[10] = _mm_add_epi32(v[10], x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_mullo_epi32(u[10], cospi24); - x = _mm_mullo_epi32(u[11], cospi40); - v[11] = _mm_sub_epi32(v[11], x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = _mm_mullo_epi32(u[12], cospim56); - x = _mm_mullo_epi32(u[13], cospi8); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi8); - x = _mm_mullo_epi32(u[13], cospim56); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim24); - x = _mm_mullo_epi32(u[15], cospi40); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi40); - x = _mm_mullo_epi32(u[15], cospim24); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 5 - addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); - - // stage 6 - v[0] = u[0]; - v[1] = u[1]; - v[2] = u[2]; - v[3] = u[3]; - - v[4] = _mm_mullo_epi32(u[4], cospi16); - x = _mm_mullo_epi32(u[5], cospi48); - v[4] = _mm_add_epi32(v[4], x); - v[4] = _mm_add_epi32(v[4], rnding); - v[4] = _mm_srai_epi32(v[4], bit); - - v[5] = _mm_mullo_epi32(u[4], cospi48); - x = _mm_mullo_epi32(u[5], cospi16); - v[5] = _mm_sub_epi32(v[5], x); - v[5] = _mm_add_epi32(v[5], rnding); - v[5] = _mm_srai_epi32(v[5], bit); - - v[6] = _mm_mullo_epi32(u[6], cospim48); - x = _mm_mullo_epi32(u[7], cospi16); - v[6] = _mm_add_epi32(v[6], x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_mullo_epi32(u[6], cospi16); - x = _mm_mullo_epi32(u[7], cospim48); - v[7] = _mm_sub_epi32(v[7], x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - v[10] = u[10]; - v[11] = u[11]; - - v[12] = _mm_mullo_epi32(u[12], cospi16); - x = _mm_mullo_epi32(u[13], cospi48); - v[12] = _mm_add_epi32(v[12], x); - v[12] = _mm_add_epi32(v[12], rnding); - v[12] = _mm_srai_epi32(v[12], bit); - - v[13] = _mm_mullo_epi32(u[12], cospi48); - x = _mm_mullo_epi32(u[13], cospi16); - v[13] = _mm_sub_epi32(v[13], x); - v[13] = _mm_add_epi32(v[13], rnding); - v[13] = _mm_srai_epi32(v[13], bit); - - v[14] = _mm_mullo_epi32(u[14], cospim48); - x = _mm_mullo_epi32(u[15], cospi16); - v[14] = _mm_add_epi32(v[14], x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_mullo_epi32(u[14], cospi16); - x = _mm_mullo_epi32(u[15], cospim48); - v[15] = _mm_sub_epi32(v[15], x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 7 - addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); - - // stage 8 - v[0] = u[0]; - v[1] = u[1]; - - y = _mm_mullo_epi32(u[2], cospi32); - x = _mm_mullo_epi32(u[3], cospi32); - v[2] = _mm_add_epi32(y, x); - v[2] = _mm_add_epi32(v[2], rnding); - v[2] = _mm_srai_epi32(v[2], bit); - - v[3] = _mm_sub_epi32(y, x); - v[3] = _mm_add_epi32(v[3], rnding); - v[3] = _mm_srai_epi32(v[3], bit); - - v[4] = u[4]; - v[5] = u[5]; - - y = _mm_mullo_epi32(u[6], cospi32); - x = _mm_mullo_epi32(u[7], cospi32); - v[6] = _mm_add_epi32(y, x); - v[6] = _mm_add_epi32(v[6], rnding); - v[6] = _mm_srai_epi32(v[6], bit); - - v[7] = _mm_sub_epi32(y, x); - v[7] = _mm_add_epi32(v[7], rnding); - v[7] = _mm_srai_epi32(v[7], bit); - - v[8] = u[8]; - v[9] = u[9]; - - y = _mm_mullo_epi32(u[10], cospi32); - x = _mm_mullo_epi32(u[11], cospi32); - v[10] = _mm_add_epi32(y, x); - v[10] = _mm_add_epi32(v[10], rnding); - v[10] = _mm_srai_epi32(v[10], bit); - - v[11] = _mm_sub_epi32(y, x); - v[11] = _mm_add_epi32(v[11], rnding); - v[11] = _mm_srai_epi32(v[11], bit); - - v[12] = u[12]; - v[13] = u[13]; - - y = _mm_mullo_epi32(u[14], cospi32); - x = _mm_mullo_epi32(u[15], cospi32); - v[14] = _mm_add_epi32(y, x); - v[14] = _mm_add_epi32(v[14], rnding); - v[14] = _mm_srai_epi32(v[14], bit); - - v[15] = _mm_sub_epi32(y, x); - v[15] = _mm_add_epi32(v[15], rnding); - v[15] = _mm_srai_epi32(v[15], bit); - - // stage 9 - if (do_cols) { - out[0] = v[0]; - out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]); - out[2] = v[12]; - out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]); - out[4] = v[6]; - out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]); - out[6] = v[10]; - out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]); - out[8] = v[3]; - out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]); - out[10] = v[15]; - out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]); - out[12] = v[5]; - out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]); - out[14] = v[9]; - out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); - const __m128i clamp_hi_out = - _mm_set1_epi32((1 << (log_range_out - 1)) - 1); - - neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, - &clamp_hi_out, out_shift); - neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, - &clamp_hi_out, out_shift); - } - } -} - -static INLINE void idct64_stage8_sse4_1( - __m128i *u, const __m128i *cospim32, const __m128i *cospi32, - const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, - const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, - const __m128i *rnding, int bit) { - int i; - __m128i temp1, temp2, temp3, temp4; - temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); - u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); - u[10] = temp1; - temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); - u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); - u[11] = temp2; - - for (i = 16; i < 20; ++i) { - addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); - addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, - clamp_hi); - } - - temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); - temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); - temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); - temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); - u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); - u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); - u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); - u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); - u[36] = temp1; - u[37] = temp2; - u[38] = temp3; - u[39] = temp4; - - temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit); - temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); - temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); - temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); - u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit); - u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); - u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); - u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); - u[40] = temp1; - u[41] = temp2; - u[42] = temp3; - u[43] = temp4; -} - -static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, - const __m128i *cospi32, - const __m128i *clamp_lo, - const __m128i *clamp_hi, - const __m128i *rnding, int bit) { - int i; - __m128i temp1, temp2, temp3, temp4; - for (i = 0; i < 8; ++i) { - addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); - } - - temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); - temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); - temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); - temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); - u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); - u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); - u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); - u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); - u[20] = temp1; - u[21] = temp2; - u[22] = temp3; - u[23] = temp4; - for (i = 32; i < 40; i++) { - addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); - } - - for (i = 48; i < 56; i++) { - addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); - } -} - -static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, - const __m128i *cospi32, - const __m128i *clamp_lo, - const __m128i *clamp_hi, - const __m128i *rnding, int bit) { - __m128i temp1, temp2, temp3, temp4; - for (int i = 0; i < 16; i++) { - addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); - } - - temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); - temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); - temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); - temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); - u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); - u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); - u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); - u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); - u[40] = temp1; - u[41] = temp2; - u[42] = temp3; - u[43] = temp4; - - temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); - temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); - temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); - temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); - u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); - u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); - u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); - u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); - u[44] = temp1; - u[45] = temp2; - u[46] = temp3; - u[47] = temp4; -} - -static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, - int bd, int out_shift, - const int log_range) { - if (do_cols) { - for (int i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]); - } - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - for (int i = 0; i < 32; i++) { - addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)], - &clamp_lo_out, &clamp_hi_out, out_shift); - } - } -} - -static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - - { - __m128i x; - - // stage 1 - // stage 2 - // stage 3 - // stage 4 - // stage 5 - // stage 6 - x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit); - - // stage 8 - // stage 9 - // stage 10 - // stage 11 - if (do_cols) { - x = _mm_max_epi32(x, clamp_lo); - x = _mm_min_epi32(x, clamp_hi); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - x = _mm_add_epi32(x, offset); - x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); - - x = _mm_max_epi32(x, clamp_lo_out); - x = _mm_min_epi32(x, clamp_hi_out); - } - - out[0] = x; - out[63] = x; - out[1] = x; - out[62] = x; - out[2] = x; - out[61] = x; - out[3] = x; - out[60] = x; - out[4] = x; - out[59] = x; - out[5] = x; - out[58] = x; - out[6] = x; - out[57] = x; - out[7] = x; - out[56] = x; - out[8] = x; - out[55] = x; - out[9] = x; - out[54] = x; - out[10] = x; - out[53] = x; - out[11] = x; - out[52] = x; - out[12] = x; - out[51] = x; - out[13] = x; - out[50] = x; - out[14] = x; - out[49] = x; - out[15] = x; - out[48] = x; - out[16] = x; - out[47] = x; - out[17] = x; - out[46] = x; - out[18] = x; - out[45] = x; - out[19] = x; - out[44] = x; - out[20] = x; - out[43] = x; - out[21] = x; - out[42] = x; - out[22] = x; - out[41] = x; - out[23] = x; - out[40] = x; - out[24] = x; - out[39] = x; - out[25] = x; - out[38] = x; - out[26] = x; - out[37] = x; - out[27] = x; - out[36] = x; - out[28] = x; - out[35] = x; - out[29] = x; - out[34] = x; - out[30] = x; - out[33] = x; - out[31] = x; - out[32] = x; - } -} - -static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - int i, j; - const int32_t *cospi = cospi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - const __m128i cospi1 = _mm_set1_epi32(cospi[1]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi3 = _mm_set1_epi32(cospi[3]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospi63 = _mm_set1_epi32(cospi[63]); - const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); - const __m128i cospi7 = _mm_set1_epi32(cospi[7]); - const __m128i cospi5 = _mm_set1_epi32(cospi[5]); - const __m128i cospi59 = _mm_set1_epi32(cospi[59]); - const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - - { - __m128i u[64]; - - // stage 1 - u[0] = in[0]; - u[8] = in[4]; - u[16] = in[2]; - u[24] = in[6]; - u[32] = in[1]; - u[40] = in[5]; - u[48] = in[3]; - u[56] = in[7]; - - // stage 2 - u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); - u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); - u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); - u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); - u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); - u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); - u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); - u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); - - // stage 3 - u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); - u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); - u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); - u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); - u[33] = u[32]; - u[38] = u[39]; - u[41] = u[40]; - u[46] = u[47]; - u[49] = u[48]; - u[54] = u[55]; - u[57] = u[56]; - u[62] = u[63]; - - // stage 4 - __m128i temp1, temp2; - u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - u[17] = u[16]; - u[22] = u[23]; - u[25] = u[24]; - u[30] = u[31]; - - temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); - u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); - u[33] = temp1; - - temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); - u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); - u[57] = temp2; - - temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); - u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); - u[41] = temp1; - - temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); - u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); - u[46] = temp2; - - // stage 5 - u[9] = u[8]; - u[14] = u[15]; - - temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); - u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); - u[17] = temp1; - - temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); - u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); - u[22] = temp2; - - u[35] = u[32]; - u[34] = u[33]; - u[36] = u[39]; - u[37] = u[38]; - u[43] = u[40]; - u[42] = u[41]; - u[44] = u[47]; - u[45] = u[46]; - u[51] = u[48]; - u[50] = u[49]; - u[52] = u[55]; - u[53] = u[54]; - u[59] = u[56]; - u[58] = u[57]; - u[60] = u[63]; - u[61] = u[62]; - - // stage 6 - temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - u[0] = temp1; - - temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - u[9] = temp2; - u[19] = u[16]; - u[18] = u[17]; - u[20] = u[23]; - u[21] = u[22]; - u[27] = u[24]; - u[26] = u[25]; - u[28] = u[31]; - u[29] = u[30]; - - temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); - u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); - u[34] = temp1; - temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); - u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); - u[35] = temp2; - temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); - u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); - u[36] = temp1; - temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); - u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); - u[37] = temp2; - temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); - u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); - u[42] = temp1; - temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); - u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); - u[43] = temp2; - temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); - u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); - u[44] = temp1; - temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); - u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); - u[45] = temp2; - - // stage 7 - u[3] = u[0]; - u[2] = u[1]; - u[11] = u[8]; - u[10] = u[9]; - u[12] = u[15]; - u[13] = u[14]; - - temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); - u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); - u[18] = temp1; - temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); - u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); - u[19] = temp2; - temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); - u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); - u[20] = temp1; - temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); - u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); - u[21] = temp2; - for (i = 32; i < 64; i += 16) { - for (j = i; j < i + 4; j++) { - addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, - &clamp_hi); - } - } - - // stage 8 - u[7] = u[0]; - u[6] = u[1]; - u[5] = u[2]; - u[4] = u[3]; - u[9] = u[9]; - - idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, - &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); - - // stage 9 - idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, - bit); - - // stage 10 - idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, - bit); - - // stage 11 - idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); - } -} - -static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - int i, j; - const int32_t *cospi = cospi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - const __m128i cospi1 = _mm_set1_epi32(cospi[1]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi3 = _mm_set1_epi32(cospi[3]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi5 = _mm_set1_epi32(cospi[5]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi7 = _mm_set1_epi32(cospi[7]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi9 = _mm_set1_epi32(cospi[9]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi11 = _mm_set1_epi32(cospi[11]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi13 = _mm_set1_epi32(cospi[13]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi15 = _mm_set1_epi32(cospi[15]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi51 = _mm_set1_epi32(cospi[51]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi55 = _mm_set1_epi32(cospi[55]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi59 = _mm_set1_epi32(cospi[59]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi63 = _mm_set1_epi32(cospi[63]); - - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); - const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); - const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); - - { - __m128i u[64]; - __m128i tmp1, tmp2, tmp3, tmp4; - // stage 1 - u[0] = in[0]; - u[32] = in[1]; - u[36] = in[9]; - u[40] = in[5]; - u[44] = in[13]; - u[48] = in[3]; - u[52] = in[11]; - u[56] = in[7]; - u[60] = in[15]; - u[16] = in[2]; - u[20] = in[10]; - u[24] = in[6]; - u[28] = in[14]; - u[4] = in[8]; - u[8] = in[4]; - u[12] = in[12]; - - // stage 2 - u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); - u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); - u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); - u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); - u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); - u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); - u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); - u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); - u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); - u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); - u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); - u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); - u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); - u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); - u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); - u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); - - // stage 3 - u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); - u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); - u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit); - u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit); - u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit); - u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit); - u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); - u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); - u[33] = u[32]; - u[34] = u[35]; - u[37] = u[36]; - u[38] = u[39]; - u[41] = u[40]; - u[42] = u[43]; - u[45] = u[44]; - u[46] = u[47]; - u[49] = u[48]; - u[50] = u[51]; - u[53] = u[52]; - u[54] = u[55]; - u[57] = u[56]; - u[58] = u[59]; - u[61] = u[60]; - u[62] = u[63]; - - // stage 4 - u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); - u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); - - u[17] = u[16]; - u[18] = u[19]; - u[21] = u[20]; - u[22] = u[23]; - u[25] = u[24]; - u[26] = u[27]; - u[29] = u[28]; - u[30] = u[31]; - - tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); - tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); - tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); - tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); - u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); - u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); - u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); - u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); - u[33] = tmp1; - u[34] = tmp2; - u[37] = tmp3; - u[38] = tmp4; - - tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); - tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); - tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); - tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); - u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); - u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); - u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); - u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); - u[41] = tmp1; - u[42] = tmp2; - u[45] = tmp3; - u[46] = tmp4; - - // stage 5 - u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); - u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); - - u[9] = u[8]; - u[10] = u[11]; - u[13] = u[12]; - u[14] = u[15]; - - tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); - tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); - tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); - tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); - u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); - u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); - u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); - u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); - u[17] = tmp1; - u[18] = tmp2; - u[21] = tmp3; - u[22] = tmp4; - - for (i = 32; i < 64; i += 8) { - addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, - &clamp_hi); - - addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, - &clamp_hi); - } - - // stage 6 - tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - u[0] = tmp1; - u[5] = u[4]; - u[6] = u[7]; - - tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - u[9] = tmp1; - tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - u[10] = tmp2; - - for (i = 16; i < 32; i += 8) { - addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, - &clamp_hi); - - addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, - &clamp_hi); - } - - tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); - tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); - tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); - tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); - u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); - u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); - u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); - u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); - u[34] = tmp1; - u[35] = tmp2; - u[36] = tmp3; - u[37] = tmp4; - - tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); - tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); - tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); - tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); - u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); - u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); - u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); - u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); - u[42] = tmp1; - u[43] = tmp2; - u[44] = tmp3; - u[45] = tmp4; - - // stage 7 - u[3] = u[0]; - u[2] = u[1]; - tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); - u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); - u[5] = tmp1; - addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); - - tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); - tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); - tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); - tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); - u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); - u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); - u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); - u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); - u[18] = tmp1; - u[19] = tmp2; - u[20] = tmp3; - u[21] = tmp4; - - for (i = 32; i < 64; i += 16) { - for (j = i; j < i + 4; j++) { - addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, - &clamp_hi); - } - } - - // stage 8 - for (i = 0; i < 4; ++i) { - addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); - } - - idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, - &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); - - // stage 9 - idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, - bit); - - // stage 10 - idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, - bit); - - // stage 11 - idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range); - } -} - -static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - int i, j; - const int32_t *cospi = cospi_arr(bit); - const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - - const __m128i cospi1 = _mm_set1_epi32(cospi[1]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospi3 = _mm_set1_epi32(cospi[3]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospi5 = _mm_set1_epi32(cospi[5]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi7 = _mm_set1_epi32(cospi[7]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospi9 = _mm_set1_epi32(cospi[9]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi11 = _mm_set1_epi32(cospi[11]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi13 = _mm_set1_epi32(cospi[13]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi15 = _mm_set1_epi32(cospi[15]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospi17 = _mm_set1_epi32(cospi[17]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi19 = _mm_set1_epi32(cospi[19]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi21 = _mm_set1_epi32(cospi[21]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi23 = _mm_set1_epi32(cospi[23]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi25 = _mm_set1_epi32(cospi[25]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi27 = _mm_set1_epi32(cospi[27]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi29 = _mm_set1_epi32(cospi[29]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi31 = _mm_set1_epi32(cospi[31]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospi35 = _mm_set1_epi32(cospi[35]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi39 = _mm_set1_epi32(cospi[39]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi43 = _mm_set1_epi32(cospi[43]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi47 = _mm_set1_epi32(cospi[47]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospi51 = _mm_set1_epi32(cospi[51]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi55 = _mm_set1_epi32(cospi[55]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi59 = _mm_set1_epi32(cospi[59]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi63 = _mm_set1_epi32(cospi[63]); - - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); - const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); - const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); - const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); - const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); - const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); - const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); - - { - __m128i u[64], v[64]; - - // stage 1 - u[32] = in[1]; - u[34] = in[17]; - u[36] = in[9]; - u[38] = in[25]; - u[40] = in[5]; - u[42] = in[21]; - u[44] = in[13]; - u[46] = in[29]; - u[48] = in[3]; - u[50] = in[19]; - u[52] = in[11]; - u[54] = in[27]; - u[56] = in[7]; - u[58] = in[23]; - u[60] = in[15]; - u[62] = in[31]; - - v[16] = in[2]; - v[18] = in[18]; - v[20] = in[10]; - v[22] = in[26]; - v[24] = in[6]; - v[26] = in[22]; - v[28] = in[14]; - v[30] = in[30]; - - u[8] = in[4]; - u[10] = in[20]; - u[12] = in[12]; - u[14] = in[28]; - - v[4] = in[8]; - v[6] = in[24]; - - u[0] = in[0]; - u[2] = in[16]; - - // stage 2 - v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); - v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); - v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); - v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); - v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); - v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); - v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); - v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); - v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); - v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); - v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); - v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); - v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); - v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); - v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); - v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); - v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); - v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); - v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); - v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); - v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); - v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); - v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); - v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); - v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); - v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); - v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); - v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); - v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); - v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); - v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); - v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); - - // stage 3 - u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); - u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); - u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); - u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); - u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); - u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); - u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit); - u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit); - u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit); - u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit); - u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit); - u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit); - u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit); - u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit); - u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit); - u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit); - - for (i = 32; i < 64; i += 4) { - addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, - &clamp_hi); - } - - // stage 4 - v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); - v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); - v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); - v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); - v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); - v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); - v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); - v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); - - for (i = 16; i < 32; i += 4) { - addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, - &clamp_hi); - } - - for (i = 32; i < 64; i += 4) { - v[i + 0] = u[i + 0]; - v[i + 3] = u[i + 3]; - } - - v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); - v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); - v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); - v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); - v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); - v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); - v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); - v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); - v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); - v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); - v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); - - // stage 5 - u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); - u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); - u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); - u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); - - for (i = 8; i < 16; i += 4) { - addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, - &clamp_hi); - } - - for (i = 16; i < 32; i += 4) { - u[i + 0] = v[i + 0]; - u[i + 3] = v[i + 3]; - } - - u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); - u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); - u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); - u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); - u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); - u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); - u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); - - for (i = 32; i < 64; i += 8) { - addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, - &clamp_hi); - - addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, - &clamp_hi); - } - - // stage 6 - v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); - v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); - v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); - - addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); - - for (i = 8; i < 16; i += 4) { - v[i + 0] = u[i + 0]; - v[i + 3] = u[i + 3]; - } - - v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); - v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); - v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); - v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); - - for (i = 16; i < 32; i += 8) { - addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, - &clamp_hi); - - addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, - &clamp_hi); - addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, - &clamp_hi); - } - - for (i = 32; i < 64; i += 8) { - v[i + 0] = u[i + 0]; - v[i + 1] = u[i + 1]; - v[i + 6] = u[i + 6]; - v[i + 7] = u[i + 7]; - } - - v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); - v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); - v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); - v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); - v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); - v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); - v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); - v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); - v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); - v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); - v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); - - // stage 7 - addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); - - u[4] = v[4]; - u[7] = v[7]; - u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); - u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); - - addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); - - for (i = 16; i < 32; i += 8) { - u[i + 0] = v[i + 0]; - u[i + 1] = v[i + 1]; - u[i + 6] = v[i + 6]; - u[i + 7] = v[i + 7]; - } - - u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); - u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); - u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); - u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); - u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); - u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); - u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); - - for (i = 32; i < 64; i += 16) { - for (j = i; j < i + 4; j++) { - addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, - &clamp_hi); - } - } - - // stage 8 - for (i = 0; i < 4; ++i) { - addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); - } - - v[8] = u[8]; - v[9] = u[9]; - v[14] = u[14]; - v[15] = u[15]; - - v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); - v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); - v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); - v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); - - for (i = 16; i < 20; ++i) { - addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); - addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, - &clamp_hi); - } - - for (i = 32; i < 36; ++i) { - v[i] = u[i]; - v[i + 12] = u[i + 12]; - v[i + 16] = u[i + 16]; - v[i + 28] = u[i + 28]; - } - - v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); - v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); - v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); - v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); - v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); - v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); - v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); - v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); - v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); - v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); - v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); - - // stage 9 - for (i = 0; i < 8; ++i) { - addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); - } - - for (i = 16; i < 20; ++i) { - u[i] = v[i]; - u[i + 12] = v[i + 12]; - } - - u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); - u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); - u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); - u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); - u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); - u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); - u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); - u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); - - for (i = 32; i < 40; i++) { - addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); - } - - for (i = 48; i < 56; i++) { - addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); - } - - // stage 10 - for (i = 0; i < 16; i++) { - addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); - } - - for (i = 32; i < 40; i++) v[i] = u[i]; - - v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); - v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); - v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); - v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); - v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); - v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); - v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); - v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); - v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); - v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); - v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); - v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); - v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); - v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); - v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); - v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); - - for (i = 56; i < 64; i++) v[i] = u[i]; - - // stage 11 - if (do_cols) { - for (i = 0; i < 32; i++) { - addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]); - } - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - for (i = 0; i < 32; i++) { - addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], - &clamp_lo_out, &clamp_hi_out, out_shift); - } - } - } -} - -static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i bf1; - - // stage 0 - // stage 1 - bf1 = in[0]; - - // stage 2 - // stage 3 - // stage 4 - // stage 5 - bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit); - - // stage 6 - // stage 7 - // stage 8 - // stage 9 - if (do_cols) { - bf1 = _mm_max_epi32(bf1, clamp_lo); - bf1 = _mm_min_epi32(bf1, clamp_hi); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); - bf1 = _mm_add_epi32(bf1, offset); - bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); - bf1 = _mm_max_epi32(bf1, clamp_lo_out); - bf1 = _mm_min_epi32(bf1, clamp_hi_out); - } - out[0] = bf1; - out[1] = bf1; - out[2] = bf1; - out[3] = bf1; - out[4] = bf1; - out[5] = bf1; - out[6] = bf1; - out[7] = bf1; - out[8] = bf1; - out[9] = bf1; - out[10] = bf1; - out[11] = bf1; - out[12] = bf1; - out[13] = bf1; - out[14] = bf1; - out[15] = bf1; - out[16] = bf1; - out[17] = bf1; - out[18] = bf1; - out[19] = bf1; - out[20] = bf1; - out[21] = bf1; - out[22] = bf1; - out[23] = bf1; - out[24] = bf1; - out[25] = bf1; - out[26] = bf1; - out[27] = bf1; - out[28] = bf1; - out[29] = bf1; - out[30] = bf1; - out[31] = bf1; -} - -static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i bf1[32]; - - // stage 0 - // stage 1 - bf1[0] = in[0]; - bf1[4] = in[4]; - bf1[8] = in[2]; - bf1[12] = in[6]; - bf1[16] = in[1]; - bf1[20] = in[5]; - bf1[24] = in[3]; - bf1[28] = in[7]; - - // stage 2 - bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); - bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); - bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); - bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); - bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); - bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); - bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); - bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); - - // stage 3 - bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); - bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); - - bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); - bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); - bf1[17] = bf1[16]; - bf1[18] = bf1[19]; - bf1[21] = bf1[20]; - bf1[22] = bf1[23]; - bf1[25] = bf1[24]; - bf1[26] = bf1[27]; - bf1[29] = bf1[28]; - bf1[30] = bf1[31]; - - // stage 4 : - bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); - bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); - - bf1[9] = bf1[8]; - bf1[10] = bf1[11]; - bf1[13] = bf1[12]; - bf1[14] = bf1[15]; - - idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, - &cospi24, &cospi40, &cospim24, &rounding, bit); - - // stage 5 - bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); - bf1[1] = bf1[0]; - bf1[5] = bf1[4]; - bf1[6] = bf1[7]; - - idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, - &clamp_hi, &rounding, bit); - - // stage 6 - bf1[3] = bf1[0]; - bf1[2] = bf1[1]; - - idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, - &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); - - // stage 7 - idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 8 - idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 9 - idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); -} - -static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); - const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); - const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i bf1[32]; - - // stage 0 - // stage 1 - - bf1[0] = in[0]; - bf1[2] = in[8]; - bf1[4] = in[4]; - bf1[6] = in[12]; - bf1[8] = in[2]; - bf1[10] = in[10]; - bf1[12] = in[6]; - bf1[14] = in[14]; - bf1[16] = in[1]; - bf1[18] = in[9]; - bf1[20] = in[5]; - bf1[22] = in[13]; - bf1[24] = in[3]; - bf1[26] = in[11]; - bf1[28] = in[7]; - bf1[30] = in[15]; - - // stage 2 - bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); - bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); - bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); - bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); - bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); - bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); - bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); - bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); - bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); - bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); - bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit); - bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); - bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); - bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); - bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); - bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); - - // stage 3 - bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); - bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); - bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); - bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); - bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); - bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); - bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); - bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); - - addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); - // stage 4 - bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); - bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); - bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); - bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); - - addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); - - idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, - &cospi24, &cospi40, &cospim24, &rounding, bit); - - // stage 5 - bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); - bf1[1] = bf1[0]; - bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); - bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); - - addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); - - idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, - &clamp_hi, &rounding, bit); - - // stage 6 - addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); - - idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, - &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); - - // stage 7 - idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 8 - idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, - &rounding, bit); - - // stage 9 - idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range); -} - -static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, - int bd, int out_shift) { - const int32_t *cospi = cospi_arr(bit); - const __m128i cospi62 = _mm_set1_epi32(cospi[62]); - const __m128i cospi30 = _mm_set1_epi32(cospi[30]); - const __m128i cospi46 = _mm_set1_epi32(cospi[46]); - const __m128i cospi14 = _mm_set1_epi32(cospi[14]); - const __m128i cospi54 = _mm_set1_epi32(cospi[54]); - const __m128i cospi22 = _mm_set1_epi32(cospi[22]); - const __m128i cospi38 = _mm_set1_epi32(cospi[38]); - const __m128i cospi6 = _mm_set1_epi32(cospi[6]); - const __m128i cospi58 = _mm_set1_epi32(cospi[58]); - const __m128i cospi26 = _mm_set1_epi32(cospi[26]); - const __m128i cospi42 = _mm_set1_epi32(cospi[42]); - const __m128i cospi10 = _mm_set1_epi32(cospi[10]); - const __m128i cospi50 = _mm_set1_epi32(cospi[50]); - const __m128i cospi18 = _mm_set1_epi32(cospi[18]); - const __m128i cospi34 = _mm_set1_epi32(cospi[34]); - const __m128i cospi2 = _mm_set1_epi32(cospi[2]); - const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); - const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); - const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); - const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); - const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); - const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); - const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); - const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); - const __m128i cospi60 = _mm_set1_epi32(cospi[60]); - const __m128i cospi28 = _mm_set1_epi32(cospi[28]); - const __m128i cospi44 = _mm_set1_epi32(cospi[44]); - const __m128i cospi12 = _mm_set1_epi32(cospi[12]); - const __m128i cospi52 = _mm_set1_epi32(cospi[52]); - const __m128i cospi20 = _mm_set1_epi32(cospi[20]); - const __m128i cospi36 = _mm_set1_epi32(cospi[36]); - const __m128i cospi4 = _mm_set1_epi32(cospi[4]); - const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); - const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); - const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); - const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); - const __m128i cospi56 = _mm_set1_epi32(cospi[56]); - const __m128i cospi24 = _mm_set1_epi32(cospi[24]); - const __m128i cospi40 = _mm_set1_epi32(cospi[40]); - const __m128i cospi8 = _mm_set1_epi32(cospi[8]); - const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); - const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); - const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); - const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); - const __m128i cospi32 = _mm_set1_epi32(cospi[32]); - const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); - const __m128i cospi48 = _mm_set1_epi32(cospi[48]); - const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); - const __m128i cospi16 = _mm_set1_epi32(cospi[16]); - const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); - const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); - const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); - const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); - const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); - __m128i bf1[32], bf0[32]; - - // stage 0 - // stage 1 - bf1[0] = in[0]; - bf1[1] = in[16]; - bf1[2] = in[8]; - bf1[3] = in[24]; - bf1[4] = in[4]; - bf1[5] = in[20]; - bf1[6] = in[12]; - bf1[7] = in[28]; - bf1[8] = in[2]; - bf1[9] = in[18]; - bf1[10] = in[10]; - bf1[11] = in[26]; - bf1[12] = in[6]; - bf1[13] = in[22]; - bf1[14] = in[14]; - bf1[15] = in[30]; - bf1[16] = in[1]; - bf1[17] = in[17]; - bf1[18] = in[9]; - bf1[19] = in[25]; - bf1[20] = in[5]; - bf1[21] = in[21]; - bf1[22] = in[13]; - bf1[23] = in[29]; - bf1[24] = in[3]; - bf1[25] = in[19]; - bf1[26] = in[11]; - bf1[27] = in[27]; - bf1[28] = in[7]; - bf1[29] = in[23]; - bf1[30] = in[15]; - bf1[31] = in[31]; - - // stage 2 - bf0[0] = bf1[0]; - bf0[1] = bf1[1]; - bf0[2] = bf1[2]; - bf0[3] = bf1[3]; - bf0[4] = bf1[4]; - bf0[5] = bf1[5]; - bf0[6] = bf1[6]; - bf0[7] = bf1[7]; - bf0[8] = bf1[8]; - bf0[9] = bf1[9]; - bf0[10] = bf1[10]; - bf0[11] = bf1[11]; - bf0[12] = bf1[12]; - bf0[13] = bf1[13]; - bf0[14] = bf1[14]; - bf0[15] = bf1[15]; - bf0[16] = - half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); - bf0[17] = - half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); - bf0[18] = - half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); - bf0[19] = - half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); - bf0[20] = - half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); - bf0[21] = - half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); - bf0[22] = - half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); - bf0[23] = - half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); - bf0[24] = - half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); - bf0[25] = - half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); - bf0[26] = - half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); - bf0[27] = - half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); - bf0[28] = - half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); - bf0[29] = - half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); - bf0[30] = - half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); - bf0[31] = - half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); - - // stage 3 - bf1[0] = bf0[0]; - bf1[1] = bf0[1]; - bf1[2] = bf0[2]; - bf1[3] = bf0[3]; - bf1[4] = bf0[4]; - bf1[5] = bf0[5]; - bf1[6] = bf0[6]; - bf1[7] = bf0[7]; - bf1[8] = - half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); - bf1[9] = - half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); - bf1[10] = - half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); - bf1[11] = - half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); - bf1[12] = - half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); - bf1[13] = - half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); - bf1[14] = - half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); - bf1[15] = - half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); - - addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); - - // stage 4 - bf0[0] = bf1[0]; - bf0[1] = bf1[1]; - bf0[2] = bf1[2]; - bf0[3] = bf1[3]; - bf0[4] = - half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); - bf0[5] = - half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); - bf0[6] = - half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); - bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); - - addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); - - bf0[16] = bf1[16]; - bf0[17] = - half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); - bf0[18] = - half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); - bf0[19] = bf1[19]; - bf0[20] = bf1[20]; - bf0[21] = - half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); - bf0[22] = - half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); - bf0[23] = bf1[23]; - bf0[24] = bf1[24]; - bf0[25] = - half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); - bf0[26] = - half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); - bf0[27] = bf1[27]; - bf0[28] = bf1[28]; - bf0[29] = - half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); - bf0[30] = - half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); - bf0[31] = bf1[31]; - - // stage 5 - bf1[0] = - half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); - bf1[1] = - half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); - bf1[2] = - half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); - bf1[3] = - half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); - addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); - bf1[8] = bf0[8]; - bf1[9] = - half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); - bf1[10] = - half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); - bf1[11] = bf0[11]; - bf1[12] = bf0[12]; - bf1[13] = - half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); - bf1[14] = - half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); - bf1[15] = bf0[15]; - addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); - - // stage 6 - addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); - bf0[4] = bf1[4]; - bf0[5] = - half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); - bf0[6] = - half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); - bf0[7] = bf1[7]; - addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); - bf0[16] = bf1[16]; - bf0[17] = bf1[17]; - bf0[18] = - half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); - bf0[19] = - half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); - bf0[20] = - half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); - bf0[21] = - half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); - bf0[22] = bf1[22]; - bf0[23] = bf1[23]; - bf0[24] = bf1[24]; - bf0[25] = bf1[25]; - bf0[26] = - half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); - bf0[27] = - half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); - bf0[28] = - half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); - bf0[29] = - half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); - bf0[30] = bf1[30]; - bf0[31] = bf1[31]; - - // stage 7 - addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); - bf1[8] = bf0[8]; - bf1[9] = bf0[9]; - bf1[10] = - half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); - bf1[11] = - half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); - bf1[12] = - half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); - bf1[13] = - half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); - bf1[14] = bf0[14]; - bf1[15] = bf0[15]; - addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); - - // stage 8 - addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); - addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); - bf0[16] = bf1[16]; - bf0[17] = bf1[17]; - bf0[18] = bf1[18]; - bf0[19] = bf1[19]; - bf0[20] = - half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); - bf0[21] = - half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); - bf0[22] = - half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); - bf0[23] = - half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); - bf0[24] = - half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); - bf0[25] = - half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); - bf0[26] = - half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); - bf0[27] = - half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); - bf0[28] = bf1[28]; - bf0[29] = bf1[29]; - bf0[30] = bf1[30]; - bf0[31] = bf1[31]; - - // stage 9 - if (do_cols) { - addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31); - addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30); - addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29); - addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28); - addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27); - addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26); - addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25); - addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24); - addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23); - addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22); - addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21); - addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20); - addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19); - addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18); - addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17); - addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16); - } else { - const int log_range_out = AOMMAX(16, bd + 6); - const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( - -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); - const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( - (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); - - addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out, - &clamp_hi_out, out_shift); - addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out, - &clamp_hi_out, out_shift); - } -} - -void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. - case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); - int eob = txfm_param->eob; - int bd = txfm_param->bd; - int lossless = txfm_param->lossless; - const int32_t *src = cast_to_int32(input); - const TX_TYPE tx_type = txfm_param->tx_type; - if (lossless) { - assert(tx_type == DCT_DCT); - av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); - return; - } - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } -} - -static const transform_1d_sse4_1 - highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { - { - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, - }, - { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, - { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, - { - { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, - NULL }, - { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, - NULL }, - { NULL, NULL, NULL, NULL }, - }, - { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, - idct32x32_sse4_1 }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, - { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, - idct64x64_sse4_1 }, - { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } } - }; - -static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, - uint16_t *output, - int stride, TX_TYPE tx_type, - TX_SIZE tx_size, int eob, - const int bd) { - __m128i buf1[64 * 16]; - int eobx, eoby; - get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); - const int8_t *shift = inv_txfm_shift_ls[tx_size]; - const int txw_idx = get_txw_idx(tx_size); - const int txh_idx = get_txh_idx(tx_size); - const int txfm_size_col = tx_size_wide[tx_size]; - const int txfm_size_row = tx_size_high[tx_size]; - const int buf_size_w_div8 = txfm_size_col >> 2; - const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; - const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; - const int input_stride = AOMMIN(32, txfm_size_col); - const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); - - const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; - const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; - const transform_1d_sse4_1 row_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; - const transform_1d_sse4_1 col_txfm = - highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; - - assert(col_txfm != NULL); - assert(row_txfm != NULL); - int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); - - // 1st stage: column transform - for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { - __m128i buf0[64]; - const int32_t *input_row = input + i * input_stride * 4; - for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { - __m128i *buf0_cur = buf0 + j * 4; - load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); - - TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], - buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); - } - if (rect_type == 1 || rect_type == -1) { - av1_round_shift_rect_array_32_sse4_1( - buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); - } - row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); - - __m128i *_buf1 = buf1 + i * 4; - if (lr_flip) { - for (int j = 0; j < buf_size_w_div8; ++j) { - TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], - buf0[4 * j], - _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], - _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], - _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], - _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); - } - } else { - for (int j = 0; j < buf_size_w_div8; ++j) { - TRANSPOSE_4X4( - buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], - _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], - _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); - } - } - } - // 2nd stage: column transform - for (int i = 0; i < buf_size_w_div8; i++) { - col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, - inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); - - av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, - buf1 + i * txfm_size_row, txfm_size_row, - -shift[1]); - } - - // write to buffer - { - for (int i = 0; i < (txfm_size_col >> 3); i++) { - highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, - output + 8 * i, stride, ud_flip, - txfm_size_row, bd); - } - } -} - -void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, - uint8_t *output, int stride, - TX_TYPE tx_type, TX_SIZE tx_size, - int eob, const int bd) { - switch (tx_type) { - case DCT_DCT: - case ADST_DCT: - case DCT_ADST: - case ADST_ADST: - case FLIPADST_DCT: - case DCT_FLIPADST: - case FLIPADST_FLIPADST: - case ADST_FLIPADST: - case FLIPADST_ADST: - highbd_inv_txfm2d_add_no_identity_sse41( - input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, - bd); - break; - default: assert(0); break; - } -} - -void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); - const TX_SIZE tx_size = txfm_param->tx_size; - switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_8X8: - av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_4X8: - av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param); - break; - case TX_8X4: - av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param); - break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param); - break; - case TX_32X64: - av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param); - break; - case TX_64X32: - av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param); - break; - case TX_4X4: - av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X4: - av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param); - break; - case TX_4X16: - av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param); - break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param); - break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param); - break; - case TX_64X64: - case TX_16X64: - case TX_64X16: - av1_highbd_inv_txfm2d_add_universe_sse4_1( - input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, - txfm_param->eob, txfm_param->bd); - break; - default: assert(0 && "Invalid transform size"); break; - } -} diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c deleted file mode 100644 index e298cf653..000000000 --- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c +++ /dev/null @@ -1,846 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve_avx2.h" -#include "aom_dsp/x86/convolve_common_intrin.h" -#include "aom_dsp/x86/convolve_sse4_1.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "av1/common/convolve.h" - -void av1_highbd_jnt_convolve_2d_copy_avx2( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - - const int bits = - FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; - const __m128i left_shift = _mm_cvtsi32_si128(bits); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi32(w0); - const __m256i wt1 = _mm256_set1_epi32(w1); - const __m256i zero = _mm256_setzero_si256(); - int i, j; - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi32(offset); - const __m256i offset_const_16b = _mm256_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); - const __m256i clip_pixel_to_bd = - _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - - assert(bits <= 4); - - if (!(w % 16)) { - for (i = 0; i < h; i += 1) { - for (j = 0; j < w; j += 16) { - const __m256i src_16bit = - _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j])); - - const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); - - if (do_average) { - const __m256i data_0 = - _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); - - const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero); - const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero); - - const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); - const __m256i res_unsigned_lo = - _mm256_add_epi32(res_32b_lo, offset_const); - - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); - const __m256i res_unsigned_hi = - _mm256_add_epi32(res_32b_hi, offset_const); - - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result_lo = highbd_convolve_rounding( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m256i round_result_hi = highbd_convolve_rounding( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result_lo, round_result_hi); - const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip); - } else { - const __m256i res_unsigned_16b = - _mm256_adds_epu16(res, offset_const_16b); - - _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), - res_unsigned_16b); - } - } - } - } else if (!(w % 4)) { - for (i = 0; i < h; i += 2) { - for (j = 0; j < w; j += 8) { - const __m128i src_row_0 = - _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); - const __m128i src_row_1 = - _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride])); - // since not all compilers yet support _mm256_set_m128i() - const __m256i src_10 = _mm256_insertf128_si256( - _mm256_castsi128_si256(src_row_0), src_row_1, 1); - - const __m256i res = _mm256_sll_epi16(src_10, left_shift); - - if (w - j < 8) { - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - - const __m256i res_32b = _mm256_unpacklo_epi16(res, zero); - const __m256i res_unsigned_lo = - _mm256_add_epi32(res_32b, offset_const); - - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result = highbd_convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result, round_result); - const __m256i res_clip = - _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - const __m256i res_unsigned_16b = - _mm256_adds_epu16(res, offset_const_16b); - - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); - - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } else { - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); - const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - - const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); - const __m256i res_unsigned_lo = - _mm256_add_epi32(res_32b_lo, offset_const); - - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); - const __m256i res_unsigned_hi = - _mm256_add_epi32(res_32b_hi, offset_const); - - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result_lo = - highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, - &rounding_const, rounding_shift); - const __m256i round_result_hi = - highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, - &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result_lo, round_result_hi); - const __m256i res_clip = - _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - const __m256i res_unsigned_16b = - _mm256_adds_epu16(res, offset_const_16b); - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); - - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - } - } - } -} - -void av1_highbd_jnt_convolve_2d_avx2( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - int im_h = h + filter_params_y->taps - 1; - int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - // Check that, even with 12-bit input, the intermediate values will fit - // into an unsigned 16-bit intermediate array. - assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); - - __m256i s[8], coeffs_y[4], coeffs_x[4]; - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi32(w0); - const __m256i wt1 = _mm256_set1_epi32(w1); - const __m256i zero = _mm256_setzero_si256(); - - const __m256i round_const_x = _mm256_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); - - const __m256i round_const_y = _mm256_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi32(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); - - const __m256i clip_pixel_to_bd = - _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { - for (i = 0; i < im_h; i += 2) { - const __m256i row0 = - _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); - __m256i row1 = _mm256_set1_epi16(0); - if (i + 1 < im_h) - row1 = - _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); - - const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); - const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); - - // even pixels - s[0] = _mm256_alignr_epi8(r1, r0, 0); - s[1] = _mm256_alignr_epi8(r1, r0, 4); - s[2] = _mm256_alignr_epi8(r1, r0, 8); - s[3] = _mm256_alignr_epi8(r1, r0, 12); - - __m256i res_even = convolve(s, coeffs_x); - res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), - round_shift_x); - - // odd pixels - s[0] = _mm256_alignr_epi8(r1, r0, 2); - s[1] = _mm256_alignr_epi8(r1, r0, 6); - s[2] = _mm256_alignr_epi8(r1, r0, 10); - s[3] = _mm256_alignr_epi8(r1, r0, 14); - - __m256i res_odd = convolve(s, coeffs_x); - res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), - round_shift_x); - - __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); - __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); - __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); - - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); - } - } - - /* Vertical filter */ - { - __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); - __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); - __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); - __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - - s[0] = _mm256_unpacklo_epi16(s0, s1); - s[1] = _mm256_unpacklo_epi16(s2, s3); - s[2] = _mm256_unpacklo_epi16(s4, s5); - - s[4] = _mm256_unpackhi_epi16(s0, s1); - s[5] = _mm256_unpackhi_epi16(s2, s3); - s[6] = _mm256_unpackhi_epi16(s4, s5); - - for (i = 0; i < h; i += 2) { - const int16_t *data = &im_block[i * im_stride]; - - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); - - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); - - const __m256i res_a = convolve(s, coeffs_y); - - const __m256i res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a, round_const_y), round_shift_y); - - const __m256i res_unsigned_lo = - _mm256_add_epi32(res_a_round, offset_const); - - if (w - j < 8) { - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result = highbd_convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result, round_result); - const __m256i res_clip = - _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - __m256i res_16b = - _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); - const __m128i res_0 = _mm256_castsi256_si128(res_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); - - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } else { - const __m256i res_b = convolve(s + 4, coeffs_y); - const __m256i res_b_round = _mm256_sra_epi32( - _mm256_add_epi32(res_b, round_const_y), round_shift_y); - - __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); - - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); - const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result_lo = - highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, - &rounding_const, rounding_shift); - const __m256i round_result_hi = - highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, - &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result_lo, round_result_hi); - const __m256i res_clip = - _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - __m256i res_16b = - _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); - const __m128i res_0 = _mm256_castsi256_si128(res_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); - - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } - } -} - -void av1_highbd_jnt_convolve_x_avx2( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_horiz; - const int bits = FILTER_BITS - conv_params->round_1; - (void)filter_params_y; - (void)subpel_y_q4; - - int i, j; - __m256i s[4], coeffs_x[4]; - - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi32(w0); - const __m256i wt1 = _mm256_set1_epi32(w1); - const __m256i zero = _mm256_setzero_si256(); - - const __m256i round_const_x = - _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); - const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi32(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); - const __m256i clip_pixel_to_bd = - _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - - assert(bits >= 0); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - for (i = 0; i < h; i += 2) { - const __m256i row0 = - _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); - __m256i row1 = - _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); - - const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); - const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); - - // even pixels - s[0] = _mm256_alignr_epi8(r1, r0, 0); - s[1] = _mm256_alignr_epi8(r1, r0, 4); - s[2] = _mm256_alignr_epi8(r1, r0, 8); - s[3] = _mm256_alignr_epi8(r1, r0, 12); - - __m256i res_even = convolve(s, coeffs_x); - res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), - round_shift_x); - - // odd pixels - s[0] = _mm256_alignr_epi8(r1, r0, 2); - s[1] = _mm256_alignr_epi8(r1, r0, 6); - s[2] = _mm256_alignr_epi8(r1, r0, 10); - s[3] = _mm256_alignr_epi8(r1, r0, 14); - - __m256i res_odd = convolve(s, coeffs_x); - res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), - round_shift_x); - - res_even = _mm256_sll_epi32(res_even, round_shift_bits); - res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); - - __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); - - __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const); - - if (w - j < 8) { - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result = highbd_convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result, round_result); - const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - __m256i res_16b = - _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); - const __m128i res_0 = _mm256_castsi256_si128(res_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); - - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } else { - __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); - __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const); - - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); - const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result_lo = highbd_convolve_rounding( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m256i round_result_hi = highbd_convolve_rounding( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result_lo, round_result_hi); - const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), - res_1); - } else { - __m256i res_16b = - _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); - const __m128i res_0 = _mm256_castsi256_si128(res_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); - - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - } - } -} - -void av1_highbd_jnt_convolve_y_avx2( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride; - const int bits = FILTER_BITS - conv_params->round_0; - (void)filter_params_x; - (void)subpel_x_q4; - - assert(bits >= 0); - int i, j; - __m256i s[8], coeffs_y[4]; - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi32(w0); - const __m256i wt1 = _mm256_set1_epi32(w1); - const __m256i round_const_y = - _mm256_set1_epi32(((1 << conv_params->round_1) >> 1)); - const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi32(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); - const __m256i clip_pixel_to_bd = - _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - const __m256i zero = _mm256_setzero_si256(); - - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - for (j = 0; j < w; j += 8) { - const uint16_t *data = &src_ptr[j]; - /* Vertical filter */ - { - __m256i src6; - __m256i s01 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - __m256i s12 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - __m256i s23 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - __m256i s34 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - __m256i s45 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - __m256i s56 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi16(s01, s12); - s[1] = _mm256_unpacklo_epi16(s23, s34); - s[2] = _mm256_unpacklo_epi16(s45, s56); - - s[4] = _mm256_unpackhi_epi16(s01, s12); - s[5] = _mm256_unpackhi_epi16(s23, s34); - s[6] = _mm256_unpackhi_epi16(s45, s56); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - - const __m256i s67 = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - - const __m256i s78 = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); - - s[3] = _mm256_unpacklo_epi16(s67, s78); - s[7] = _mm256_unpackhi_epi16(s67, s78); - - const __m256i res_a = convolve(s, coeffs_y); - - __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits); - res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a_round, round_const_y), round_shift_y); - - __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); - - if (w - j < 8) { - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result = highbd_convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result, round_result); - const __m256i res_clip = - _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - __m256i res_16b = - _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); - const __m128i res_0 = _mm256_castsi256_si128(res_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); - - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } else { - const __m256i res_b = convolve(s + 4, coeffs_y); - __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits); - res_b_round = _mm256_sra_epi32( - _mm256_add_epi32(res_b_round, round_const_y), round_shift_y); - - __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); - - if (do_average) { - const __m256i data_0 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); - const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); - const __m256i data_01 = - _mm256_permute2x128_si256(data_0, data_1, 0x20); - - const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); - const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m256i round_result_lo = - highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, - &rounding_const, rounding_shift); - const __m256i round_result_hi = - highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, - &rounding_const, rounding_shift); - - const __m256i res_16b = - _mm256_packus_epi32(round_result_lo, round_result_hi); - const __m256i res_clip = - _mm256_min_epi16(res_16b, clip_pixel_to_bd); - - const __m128i res_0 = _mm256_castsi256_si128(res_clip); - const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); - } else { - __m256i res_16b = - _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); - const __m128i res_0 = _mm256_castsi256_si128(res_16b); - const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); - - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } - } -} diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c deleted file mode 100644 index 1a29985b5..000000000 --- a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <smmintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve_sse2.h" -#include "aom_dsp/x86/convolve_sse4_1.h" - -void av1_highbd_jnt_convolve_y_sse4_1( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_vert * src_stride; - const int bits = FILTER_BITS - conv_params->round_0; - (void)filter_params_x; - (void)subpel_x_q4; - - assert(bits >= 0); - int i, j; - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi32(w0); - const __m128i wt1 = _mm_set1_epi32(w1); - const __m128i round_const_y = - _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); - const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi32(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); - const __m128i clip_pixel_to_bd = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - const __m128i zero = _mm_setzero_si128(); - __m128i s[16], coeffs_y[4]; - - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - for (j = 0; j < w; j += 8) { - const uint16_t *data = &src_ptr[j]; - /* Vertical filter */ - { - __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); - __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); - __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); - __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); - __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); - __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); - __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); - - s[0] = _mm_unpacklo_epi16(s0, s1); - s[1] = _mm_unpacklo_epi16(s2, s3); - s[2] = _mm_unpacklo_epi16(s4, s5); - - s[4] = _mm_unpackhi_epi16(s0, s1); - s[5] = _mm_unpackhi_epi16(s2, s3); - s[6] = _mm_unpackhi_epi16(s4, s5); - - s[0 + 8] = _mm_unpacklo_epi16(s1, s2); - s[1 + 8] = _mm_unpacklo_epi16(s3, s4); - s[2 + 8] = _mm_unpacklo_epi16(s5, s6); - - s[4 + 8] = _mm_unpackhi_epi16(s1, s2); - s[5 + 8] = _mm_unpackhi_epi16(s3, s4); - s[6 + 8] = _mm_unpackhi_epi16(s5, s6); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - - __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); - __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); - - s[3] = _mm_unpacklo_epi16(s6, s7); - s[7] = _mm_unpackhi_epi16(s6, s7); - - s[3 + 8] = _mm_unpacklo_epi16(s7, s8); - s[7 + 8] = _mm_unpackhi_epi16(s7, s8); - - const __m128i res_a0 = convolve(s, coeffs_y); - __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits); - res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y), - round_shift_y); - - const __m128i res_a1 = convolve(s + 8, coeffs_y); - __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits); - res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y), - round_shift_y); - - __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const); - __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const); - - if (w - j < 8) { - if (do_average) { - const __m128i data_0 = - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); - const __m128i data_1 = _mm_loadl_epi64( - (__m128i *)(&dst[i * dst_stride + j + dst_stride])); - - const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); - const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); - - const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg); - - const __m128i round_result_0 = - highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, - &rounding_const, rounding_shift); - const __m128i round_result_1 = - highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const, - &rounding_const, rounding_shift); - - const __m128i res_16b_0 = - _mm_packus_epi32(round_result_0, round_result_0); - const __m128i res_clip_0 = - _mm_min_epi16(res_16b_0, clip_pixel_to_bd); - const __m128i res_16b_1 = - _mm_packus_epi32(round_result_1, round_result_1); - const __m128i res_clip_1 = - _mm_min_epi16(res_16b_1, clip_pixel_to_bd); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), - res_clip_0); - _mm_storel_epi64( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), - res_clip_1); - - } else { - __m128i res_16b_0 = - _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0); - - __m128i res_16b_1 = - _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1); - - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_16b_1); - } - } else { - const __m128i res_b0 = convolve(s + 4, coeffs_y); - __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits); - res_b_round0 = _mm_sra_epi32( - _mm_add_epi32(res_b_round0, round_const_y), round_shift_y); - - const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); - __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits); - res_b_round1 = _mm_sra_epi32( - _mm_add_epi32(res_b_round1, round_const_y), round_shift_y); - - __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const); - __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const); - - if (do_average) { - const __m128i data_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - const __m128i data_1 = _mm_loadu_si128( - (__m128i *)(&dst[i * dst_stride + j + dst_stride])); - const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero); - const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero); - - const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero); - const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero); - - const __m128i comp_avg_res_lo_0 = - highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, - &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_lo_1 = - highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, - &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi_0 = - highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, - &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi_1 = - highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, - &wt0, &wt1, use_jnt_comp_avg); - - const __m128i round_result_lo_0 = - highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, - &rounding_const, rounding_shift); - const __m128i round_result_lo_1 = - highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const, - &rounding_const, rounding_shift); - const __m128i round_result_hi_0 = - highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const, - &rounding_const, rounding_shift); - const __m128i round_result_hi_1 = - highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const, - &rounding_const, rounding_shift); - - const __m128i res_16b_0 = - _mm_packus_epi32(round_result_lo_0, round_result_hi_0); - const __m128i res_clip_0 = - _mm_min_epi16(res_16b_0, clip_pixel_to_bd); - - const __m128i res_16b_1 = - _mm_packus_epi32(round_result_lo_1, round_result_hi_1); - const __m128i res_clip_1 = - _mm_min_epi16(res_16b_1, clip_pixel_to_bd); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), - res_clip_0); - _mm_store_si128( - (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), - res_clip_1); - } else { - __m128i res_16bit0 = - _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0); - __m128i res_16bit1 = - _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_16bit1); - } - } - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - - s[0 + 8] = s[1 + 8]; - s[1 + 8] = s[2 + 8]; - s[2 + 8] = s[3 + 8]; - - s[4 + 8] = s[5 + 8]; - s[5 + 8] = s[6 + 8]; - s[6 + 8] = s[7 + 8]; - - s6 = s8; - } - } - } -} - -void av1_highbd_jnt_convolve_x_sse4_1( - const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, - int h, const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, const int subpel_x_q4, - const int subpel_y_q4, ConvolveParams *conv_params, int bd) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint16_t *const src_ptr = src - fo_horiz; - const int bits = FILTER_BITS - conv_params->round_1; - (void)filter_params_y; - (void)subpel_y_q4; - - int i, j; - __m128i s[4], coeffs_x[4]; - - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi32(w0); - const __m128i wt1 = _mm_set1_epi32(w1); - const __m128i zero = _mm_setzero_si128(); - - const __m128i round_const_x = - _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); - const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); - const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi32(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); - const __m128i clip_pixel_to_bd = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - - assert(bits >= 0); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - for (i = 0; i < h; i += 1) { - const __m128i row00 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i row01 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); - - // even pixels - s[0] = _mm_alignr_epi8(row01, row00, 0); - s[1] = _mm_alignr_epi8(row01, row00, 4); - s[2] = _mm_alignr_epi8(row01, row00, 8); - s[3] = _mm_alignr_epi8(row01, row00, 12); - - __m128i res_even = convolve(s, coeffs_x); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); - - // odd pixels - s[0] = _mm_alignr_epi8(row01, row00, 2); - s[1] = _mm_alignr_epi8(row01, row00, 6); - s[2] = _mm_alignr_epi8(row01, row00, 10); - s[3] = _mm_alignr_epi8(row01, row00, 14); - - __m128i res_odd = convolve(s, coeffs_x); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); - - res_even = _mm_sll_epi32(res_even, round_shift_bits); - res_odd = _mm_sll_epi32(res_odd, round_shift_bits); - - __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const); - if (w - j < 8) { - if (do_average) { - const __m128i data_0 = - _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); - const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); - - const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i round_result = highbd_convolve_rounding_sse2( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_16b = _mm_packus_epi32(round_result, round_result); - const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); - } else { - __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b); - } - } else { - __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd); - __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const); - if (do_average) { - const __m128i data_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); - const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); - - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); - - const __m128i round_result_lo = highbd_convolve_rounding_sse2( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m128i round_result_hi = highbd_convolve_rounding_sse2( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_16b = - _mm_packus_epi32(round_result_lo, round_result_hi); - const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); - } else { - __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h deleted file mode 100644 index 6f24e5948..000000000 --- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ -#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ - -#include <smmintrin.h> /* SSE4.1 */ - -#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ - do { \ - __m128i u0, u1, u2, u3; \ - u0 = _mm_unpacklo_epi32(x0, x1); \ - u1 = _mm_unpackhi_epi32(x0, x1); \ - u2 = _mm_unpacklo_epi32(x2, x3); \ - u3 = _mm_unpackhi_epi32(x2, x3); \ - y0 = _mm_unpacklo_epi64(u0, u2); \ - y1 = _mm_unpackhi_epi64(u0, u2); \ - y2 = _mm_unpacklo_epi64(u1, u3); \ - y3 = _mm_unpackhi_epi64(u1, u3); \ - } while (0) - -static INLINE void transpose_8x8(const __m128i *in, __m128i *out) { - TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); - TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); - TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); - TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13], - out[15]); -} - -static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { - // Upper left 8x8 - TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]); - TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24], - out[28]); - TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9], - out[13]); - TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25], - out[29]); - - // Upper right 8x8 - TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40], - out[44]); - TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56], - out[60]); - TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41], - out[45]); - TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57], - out[61]); - - // Lower left 8x8 - TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10], - out[14]); - TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26], - out[30]); - TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11], - out[15]); - TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27], - out[31]); - // Lower right 8x8 - TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42], - out[46]); - TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58], - out[62]); - TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43], - out[47]); - TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59], - out[63]); -} - -static INLINE void transpose_32x32(const __m128i *input, __m128i *output) { - for (int j = 0; j < 8; j++) { - for (int i = 0; i < 8; i++) { - TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8], - input[i * 32 + j + 16], input[i * 32 + j + 24], - output[j * 32 + i + 0], output[j * 32 + i + 8], - output[j * 32 + i + 16], output[j * 32 + i + 24]); - } - } -} - -// Note: -// rounding = 1 << (bit - 1) -static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, - const __m128i *w1, const __m128i *n1, - const __m128i *rounding, int bit) { - __m128i x, y; - - x = _mm_mullo_epi32(*w0, *n0); - y = _mm_mullo_epi32(*w1, *n1); - x = _mm_add_epi32(x, y); - x = _mm_add_epi32(x, *rounding); - x = _mm_srai_epi32(x, bit); - return x; -} - -static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, - const __m128i *rounding, int bit) { - __m128i x; - - x = _mm_mullo_epi32(*w0, *n0); - x = _mm_add_epi32(x, *rounding); - x = _mm_srai_epi32(x, bit); - return x; -} - -typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, - int do_cols, int bd, int out_shift); - -typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, - const int num_cols); - -void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, - uint8_t *output, int stride, - TX_TYPE tx_type, TX_SIZE tx_size, - int eob, const int bd); - -#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c deleted file mode 100644 index 4bcab0564..000000000 --- a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c +++ /dev/null @@ -1,624 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <smmintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/warped_motion.h" - -static const uint8_t warp_highbd_arrange_bytes[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 -}; - -static const uint8_t highbd_shuffle_alpha0_mask0[16] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 -}; -static const uint8_t highbd_shuffle_alpha0_mask1[16] = { - 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 -}; -static const uint8_t highbd_shuffle_alpha0_mask2[16] = { - 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 -}; -static const uint8_t highbd_shuffle_alpha0_mask3[16] = { - 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 -}; - -static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, - __m128i *coeff) { - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); - - // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 - coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); - // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 - coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10); - // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 - coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14); - // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 - coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); - - // Filter odd-index pixels - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11); - coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11); - coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15); - coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); -} - -static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( - int sx, __m128i *coeff) { - // Filter coeff - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); - - coeff[0] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); - coeff[2] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1)); - coeff[4] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2)); - coeff[6] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); - - coeff[1] = coeff[0]; - coeff[3] = coeff[2]; - coeff[5] = coeff[4]; - coeff[7] = coeff[6]; -} - -static INLINE void highbd_filter_src_pixels( - const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, - const int offset_bits_horiz, const int reduce_bits_horiz, int k) { - const __m128i src_1 = *src; - const __m128i src2_1 = *src2; - - const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + - ((1 << reduce_bits_horiz) >> 1)); - - const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); - - __m128i res_even = - _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); - res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); - - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); - const __m128i res_5 = - _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); - - __m128i res_odd = - _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); - res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), - _mm_cvtsi32_si128(reduce_bits_horiz)); - - // Combine results into one register. - // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 - // as this order helps with the vertical filter. - tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); -} - -static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, - __m128i *tmp, int sx, int alpha, int k, - const int offset_bits_horiz, - const int reduce_bits_horiz) { - __m128i coeff[8]; - highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); - highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, - reduce_bits_horiz, k); -} - -static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( - const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - (void)beta; - (void)alpha; - int k; - - __m128i coeff[8]; - highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); - - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, - reduce_bits_horiz, k); - } -} - -static INLINE void highbd_warp_horizontal_filter_alpha0( - const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - (void)alpha; - int k; - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - - __m128i coeff[8]; - highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); - highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, - reduce_bits_horiz, k); - } -} - -static INLINE void highbd_warp_horizontal_filter_beta0( - const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - (void)beta; - int k; - __m128i coeff[8]; - highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); - - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, - reduce_bits_horiz, k); - } -} - -static INLINE void highbd_warp_horizontal_filter( - const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - int k; - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - - highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, - reduce_bits_horiz); - } -} - -static INLINE void highbd_prepare_warp_horizontal_filter( - const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - if (alpha == 0 && beta == 0) - highbd_warp_horizontal_filter_alpha0_beta0( - ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, - offset_bits_horiz, reduce_bits_horiz); - - else if (alpha == 0 && beta != 0) - highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, - beta, p_height, height, i, - offset_bits_horiz, reduce_bits_horiz); - - else if (alpha != 0 && beta == 0) - highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, - beta, p_height, height, i, - offset_bits_horiz, reduce_bits_horiz); - else - highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, - p_height, height, i, offset_bits_horiz, - reduce_bits_horiz); -} - -void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, - int width, int height, int stride, - uint16_t *pred, int p_col, int p_row, - int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, int bd, - ConvolveParams *conv_params, int16_t alpha, - int16_t beta, int16_t gamma, int16_t delta) { - __m128i tmp[15]; - int i, j, k; - const int reduce_bits_horiz = - conv_params->round_0 + - AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); - const int reduce_bits_vert = conv_params->is_compound - ? conv_params->round_1 - : 2 * FILTER_BITS - reduce_bits_horiz; - const int offset_bits_horiz = bd + FILTER_BITS - 1; - assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); - assert(!(bd == 12 && reduce_bits_horiz < 5)); - assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); - - const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; - const __m128i clip_pixel = - _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); - const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); - const __m128i reduce_bits_vert_const = - _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); - const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const __m128i res_sub_const = - _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - - (1 << (offset_bits - conv_params->round_1 - 1))); - __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); - __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi32(w0); - const __m128i wt1 = _mm_set1_epi32(w1); - - /* Note: For this code to work, the left/right frame borders need to be - extended by at least 13 pixels each. By the time we get here, other - code will have set up this border, but we allow an explicit check - for debugging purposes. - */ - /*for (i = 0; i < height; ++i) { - for (j = 0; j < 13; ++j) { - assert(ref[i * stride - 13 + j] == ref[i * stride]); - assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); - } - }*/ - - for (i = 0; i < p_height; i += 8) { - for (j = 0; j < p_width; j += 8) { - const int32_t src_x = (p_col + j + 4) << subsampling_x; - const int32_t src_y = (p_row + i + 4) << subsampling_y; - const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; - const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; - const int32_t x4 = dst_x >> subsampling_x; - const int32_t y4 = dst_y >> subsampling_y; - - int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; - int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - - // Add in all the constant terms, including rounding and offset - sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - - // Horizontal filter - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. - if (ix4 <= -7) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); - } - } else if (ix4 >= width + 6) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = - _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride + (width - 1)] * - (1 << (FILTER_BITS - reduce_bits_horiz))); - } - } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { - const int out_of_boundary_left = -(ix4 - 6); - const int out_of_boundary_right = (ix4 + 8) - width; - - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - const __m128i src2 = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); - - const __m128i src_01 = _mm_shuffle_epi8( - src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); - const __m128i src2_01 = _mm_shuffle_epi8( - src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); - - __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01); - __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01); - - if (out_of_boundary_left >= 0) { - const __m128i shuffle_reg_left = - _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); - src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); - src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left); - } - - if (out_of_boundary_right >= 0) { - const __m128i shuffle_reg_right = _mm_loadu_si128( - (__m128i *)warp_pad_right[out_of_boundary_right]); - src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right); - src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right); - } - - const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); - const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); - - highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, - offset_bits_horiz, reduce_bits_horiz); - } - } else { - highbd_prepare_warp_horizontal_filter( - ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, - offset_bits_horiz, reduce_bits_horiz); - } - - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - // Load from tmp and rearrange pairs of consecutive rows into the - // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - const __m128i *src = tmp + (k + 4); - const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); - - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); - const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); - const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); - const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + - ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); - const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); - const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); - const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - if (conv_params->is_compound) { - __m128i *const p = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j]; - res_lo = _mm_add_epi32(res_lo, res_add_const); - res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), - reduce_bits_vert_shift); - - if (conv_params->do_average) { - __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); - - if (conv_params->use_jnt_comp_avg) { - res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), - _mm_mullo_epi32(res_lo, wt1)); - res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); - } else { - res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1); - } - - __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const); - res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const), - round_bits_shift); - - __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); - res16_lo = _mm_min_epi16(res16_lo, clip_pixel); - _mm_storel_epi64(dst16, res16_lo); - } else { - res_lo = _mm_packus_epi32(res_lo, res_lo); - _mm_storel_epi64(p, res_lo); - } - if (p_width > 4) { - __m128i *const p4 = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; - - res_hi = _mm_add_epi32(res_hi, res_add_const); - res_hi = - _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), - reduce_bits_vert_shift); - if (conv_params->do_average) { - __m128i *const dst16_4 = - (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; - __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); - - if (conv_params->use_jnt_comp_avg) { - res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), - _mm_mullo_epi32(res_hi, wt1)); - res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); - } else { - res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1); - } - - __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const); - res32_hi = _mm_sra_epi32( - _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift); - __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi); - res16_hi = _mm_min_epi16(res16_hi, clip_pixel); - _mm_storel_epi64(dst16_4, res16_hi); - } else { - res_hi = _mm_packus_epi32(res_hi, res_hi); - _mm_storel_epi64(p4, res_hi); - } - } - } else { - // Round and pack into 8 bits - const __m128i round_const = - _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + - ((1 << reduce_bits_vert) >> 1)); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), reduce_bits_vert); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), reduce_bits_vert); - - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - // Clamp res_16bit to the range [0, 2^bd - 1] - const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); - const __m128i zero = _mm_setzero_si128(); - res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - _mm_storel_epi64(p, res_16bit); - } else { - _mm_storeu_si128(p, res_16bit); - } - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c deleted file mode 100644 index 0c8a8505b..000000000 --- a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "av1/common/convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/synonyms_avx2.h" - -// 128-bit xmmwords are written as [ ... ] with the MSB on the left. -// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB -// on the left. -// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be -// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ]. -void av1_highbd_wiener_convolve_add_src_avx2( - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, int h, - const ConvolveParams *conv_params, int bd) { - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); - (void)x_step_q4; - (void)y_step_q4; - - const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); - uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); - - DECLARE_ALIGNED(32, uint16_t, - temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 1; - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero_128 = _mm_setzero_si128(); - const __m256i zero_256 = _mm256_setzero_si256(); - - // Add an offset to account for the "add_src" part of the convolve function. - const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); - - const __m256i clamp_low = zero_256; - - /* Horizontal filter */ - { - const __m256i clamp_high_ep = - _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); - - // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] - const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); - - // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] - const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = _mm256_set1_epi32( - (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); - - for (int i = 0; i < intermediate_height; ++i) { - for (int j = 0; j < w; j += 16) { - const uint16_t *src_ij = src_ptr + i * src_stride + j; - - // Load 16-bit src data - const __m256i src_0 = yy_loadu_256(src_ij + 0); - const __m256i src_1 = yy_loadu_256(src_ij + 1); - const __m256i src_2 = yy_loadu_256(src_ij + 2); - const __m256i src_3 = yy_loadu_256(src_ij + 3); - const __m256i src_4 = yy_loadu_256(src_ij + 4); - const __m256i src_5 = yy_loadu_256(src_ij + 5); - const __m256i src_6 = yy_loadu_256(src_ij + 6); - const __m256i src_7 = yy_loadu_256(src_ij + 7); - - // Multiply src data by filter coeffs and sum pairs - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - // Calculate scalar product for even- and odd-indices separately, - // increasing to 32-bit precision - const __m256i res_even_sum = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); - const __m256i res_even = _mm256_srai_epi32( - _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); - - const __m256i res_odd_sum = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); - const __m256i res_odd = _mm256_srai_epi32( - _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); - - // Reduce to 16-bit precision and pack even- and odd-index results - // back into one register. The _mm256_packs_epi32 intrinsic returns - // a register with the pixels ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i res = _mm256_packs_epi32(res_even, res_odd); - const __m256i res_clamped = - _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep); - - // Store in a temporary array - yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); - } - } - } - - /* Vertical filter */ - { - const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); - - // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] - const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); - - // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] - const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = - _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - - (1 << (bd + conv_params->round_1 - 1))); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j; - - // Load 16-bit data from the output of the horizontal filter in - // which the pixels are ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); - const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE); - const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE); - const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE); - const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE); - const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE); - const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE); - const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE); - - // Filter the even-indices, increasing to 32-bit precision - const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); - const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); - const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); - const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); - - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - - const __m256i res_even = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); - - // Filter the odd-indices, increasing to 32-bit precision - const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); - const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); - const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); - const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); - - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - const __m256i res_odd = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); - - // Pixels are currently in the following order: - // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] - // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] - // - // Rearrange the pixels into the following order: - // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] - // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] - const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); - const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); - - const __m256i res_lo_round = _mm256_srai_epi32( - _mm256_add_epi32(res_lo, round_const), conv_params->round_1); - const __m256i res_hi_round = _mm256_srai_epi32( - _mm256_add_epi32(res_hi, round_const), conv_params->round_1); - - // Reduce to 16-bit precision and pack into the correct order: - // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] - const __m256i res_16bit = - _mm256_packs_epi32(res_lo_round, res_hi_round); - const __m256i res_16bit_clamped = _mm256_min_epi16( - _mm256_max_epi16(res_16bit, clamp_low), clamp_high); - - // Store in the dst array - yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped); - } - } - } -} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c deleted file mode 100644 index 818b1099c..000000000 --- a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> -#include <assert.h> - -#include "config/aom_dsp_rtcd.h" - -#include "av1/common/convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" - -void av1_highbd_wiener_convolve_add_src_ssse3( - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, int h, - const ConvolveParams *conv_params, int bd) { - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); - (void)x_step_q4; - (void)y_step_q4; - - const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); - uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); - - DECLARE_ALIGNED(16, uint16_t, - temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 1; - int i, j; - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero = _mm_setzero_si128(); - // Add an offset to account for the "add_src" part of the convolve function. - const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); - - /* Horizontal filter */ - { - const __m128i coeffs_x = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); - - for (i = 0; i < intermediate_height; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i data2 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(data, coeff_01); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - conv_params->round_0); - - // Filter odd-index pixels - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); - const __m128i res_5 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - conv_params->round_0); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - const __m128i maxval = - _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1); - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); - _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); - } - } - } - - /* Vertical filter */ - { - const __m128i coeffs_y = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - - (1 << (bd + conv_params->round_1 - 1))); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), conv_params->round_1); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), conv_params->round_1); - - const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); - - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storeu_si128(p, res_16bit); - } - } - } -} diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c deleted file mode 100644 index 0c857b583..000000000 --- a/third_party/aom/av1/common/x86/intra_edge_sse4.c +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <assert.h> -#include <smmintrin.h> - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { - if (!strength) return; - - DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { - { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 - { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 - { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 - }; - - DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { - { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, - { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - }; - - // Extend the first and last samples to simplify the loop for the 5-tap case - p[-1] = p[0]; - __m128i last = _mm_set1_epi8(p[sz - 1]); - _mm_storeu_si128((__m128i *)&p[sz], last); - - // Adjust input pointer for filter support area - uint8_t *in = (strength == 3) ? p - 1 : p; - - // Avoid modifying first sample - uint8_t *out = p + 1; - int len = sz - 1; - - const int use_3tap_filter = (strength < 3); - - if (use_3tap_filter) { - __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); - __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); - __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); - __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); - __m128i in0 = _mm_lddqu_si128((__m128i *)in); - while (len > 0) { - int n_out = (len < 8) ? len : 8; - __m128i d0 = _mm_shuffle_epi8(in0, shuf0); - __m128i d1 = _mm_shuffle_epi8(in0, shuf1); - d0 = _mm_maddubs_epi16(d0, coef0); - d1 = _mm_maddubs_epi16(d1, coef0); - d0 = _mm_hadd_epi16(d0, d1); - __m128i eight = _mm_set1_epi16(8); - d0 = _mm_add_epi16(d0, eight); - d0 = _mm_srai_epi16(d0, 4); - d0 = _mm_packus_epi16(d0, d0); - __m128i out0 = _mm_lddqu_si128((__m128i *)out); - __m128i n0 = _mm_set1_epi8(n_out); - __m128i mask = _mm_cmpgt_epi8(n0, iden); - out0 = _mm_blendv_epi8(out0, d0, mask); - _mm_storel_epi64((__m128i *)out, out0); - __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); - in0 = _mm_alignr_epi8(in1, in0, 8); - in += 8; - out += 8; - len -= n_out; - } - } else { // 5-tap filter - __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); - __m128i two = _mm_set1_epi8(2); - __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); - __m128i shuf_b = _mm_add_epi8(shuf_a, two); - __m128i shuf_c = _mm_add_epi8(shuf_b, two); - __m128i shuf_d = _mm_add_epi8(shuf_c, two); - __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); - __m128i in0 = _mm_lddqu_si128((__m128i *)in); - while (len > 0) { - int n_out = (len < 8) ? len : 8; - __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); - __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); - __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); - __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); - d0 = _mm_maddubs_epi16(d0, coef0); - d1 = _mm_maddubs_epi16(d1, coef0); - d2 = _mm_maddubs_epi16(d2, coef0); - d3 = _mm_maddubs_epi16(d3, coef0); - d0 = _mm_hadd_epi16(d0, d1); - d2 = _mm_hadd_epi16(d2, d3); - d0 = _mm_hadd_epi16(d0, d2); - __m128i eight = _mm_set1_epi16(8); - d0 = _mm_add_epi16(d0, eight); - d0 = _mm_srai_epi16(d0, 4); - d0 = _mm_packus_epi16(d0, d0); - __m128i out0 = _mm_lddqu_si128((__m128i *)out); - __m128i n0 = _mm_set1_epi8(n_out); - __m128i mask = _mm_cmpgt_epi8(n0, iden); - out0 = _mm_blendv_epi8(out0, d0, mask); - _mm_storel_epi64((__m128i *)out, out0); - __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); - in0 = _mm_alignr_epi8(in1, in0, 8); - in += 8; - out += 8; - len -= n_out; - } - } -} - -void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) { - if (!strength) return; - - DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { - { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 - { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 - { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 - }; - - DECLARE_ALIGNED(16, static const int16_t, - v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; - - // Extend the first and last samples to simplify the loop for the 5-tap case - p[-1] = p[0]; - __m128i last = _mm_set1_epi16(p[sz - 1]); - _mm_storeu_si128((__m128i *)&p[sz], last); - - // Adjust input pointer for filter support area - uint16_t *in = (strength == 3) ? p - 1 : p; - - // Avoid modifying first sample - uint16_t *out = p + 1; - int len = sz - 1; - - const int use_3tap_filter = (strength < 3); - - if (use_3tap_filter) { - __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); - __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); - __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); - __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); - while (len > 0) { - int n_out = (len < 8) ? len : 8; - __m128i in1 = _mm_alignr_epi8(in8, in0, 2); - __m128i in2 = _mm_alignr_epi8(in8, in0, 4); - __m128i in02 = _mm_add_epi16(in0, in2); - __m128i d0 = _mm_unpacklo_epi16(in02, in1); - __m128i d1 = _mm_unpackhi_epi16(in02, in1); - d0 = _mm_mullo_epi16(d0, coef0); - d1 = _mm_mullo_epi16(d1, coef0); - d0 = _mm_hadd_epi16(d0, d1); - __m128i eight = _mm_set1_epi16(8); - d0 = _mm_add_epi16(d0, eight); - d0 = _mm_srli_epi16(d0, 4); - __m128i out0 = _mm_lddqu_si128((__m128i *)out); - __m128i n0 = _mm_set1_epi16(n_out); - __m128i mask = _mm_cmpgt_epi16(n0, iden); - out0 = _mm_blendv_epi8(out0, d0, mask); - _mm_storeu_si128((__m128i *)out, out0); - in += 8; - in0 = in8; - in8 = _mm_lddqu_si128((__m128i *)&in[8]); - out += 8; - len -= n_out; - } - } else { // 5-tap filter - __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); - __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); - __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); - __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); - while (len > 0) { - int n_out = (len < 8) ? len : 8; - __m128i in1 = _mm_alignr_epi8(in8, in0, 2); - __m128i in2 = _mm_alignr_epi8(in8, in0, 4); - __m128i in3 = _mm_alignr_epi8(in8, in0, 6); - __m128i in4 = _mm_alignr_epi8(in8, in0, 8); - __m128i in04 = _mm_add_epi16(in0, in4); - __m128i in123 = _mm_add_epi16(in1, in2); - in123 = _mm_add_epi16(in123, in3); - __m128i d0 = _mm_unpacklo_epi16(in04, in123); - __m128i d1 = _mm_unpackhi_epi16(in04, in123); - d0 = _mm_mullo_epi16(d0, coef0); - d1 = _mm_mullo_epi16(d1, coef0); - d0 = _mm_hadd_epi16(d0, d1); - __m128i eight = _mm_set1_epi16(8); - d0 = _mm_add_epi16(d0, eight); - d0 = _mm_srli_epi16(d0, 4); - __m128i out0 = _mm_lddqu_si128((__m128i *)out); - __m128i n0 = _mm_set1_epi16(n_out); - __m128i mask = _mm_cmpgt_epi16(n0, iden); - out0 = _mm_blendv_epi8(out0, d0, mask); - _mm_storeu_si128((__m128i *)out, out0); - in += 8; - in0 = in8; - in8 = _mm_lddqu_si128((__m128i *)&in[8]); - out += 8; - len -= n_out; - } - } -} - -void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { - // interpolate half-sample positions - assert(sz <= 24); - - DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { - { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } - }; - - DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = { - { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, - { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } - }; - - // Extend first/last samples (upper-left p[-1], last p[sz-1]) - // to support 4-tap filter - p[-2] = p[-1]; - p[sz] = p[sz - 1]; - - uint8_t *in = &p[-2]; - uint8_t *out = &p[-2]; - - int n = sz + 1; // Input length including upper-left sample - - __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); - __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); - - __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); - __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); - __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); - - while (n > 0) { - __m128i in8 = _mm_alignr_epi8(in16, in0, 8); - __m128i d0 = _mm_shuffle_epi8(in0, shuf0); - __m128i d1 = _mm_shuffle_epi8(in0, shuf1); - __m128i d2 = _mm_shuffle_epi8(in8, shuf0); - __m128i d3 = _mm_shuffle_epi8(in8, shuf1); - d0 = _mm_maddubs_epi16(d0, coef0); - d1 = _mm_maddubs_epi16(d1, coef0); - d2 = _mm_maddubs_epi16(d2, coef0); - d3 = _mm_maddubs_epi16(d3, coef0); - d0 = _mm_hadd_epi16(d0, d1); - d2 = _mm_hadd_epi16(d2, d3); - __m128i eight = _mm_set1_epi16(8); - d0 = _mm_add_epi16(d0, eight); - d2 = _mm_add_epi16(d2, eight); - d0 = _mm_srai_epi16(d0, 4); - d2 = _mm_srai_epi16(d2, 4); - d0 = _mm_packus_epi16(d0, d2); - __m128i in1 = _mm_alignr_epi8(in16, in0, 1); - __m128i out0 = _mm_unpacklo_epi8(in1, d0); - __m128i out1 = _mm_unpackhi_epi8(in1, d0); - _mm_storeu_si128((__m128i *)&out[0], out0); - _mm_storeu_si128((__m128i *)&out[16], out1); - in0 = in16; - in16 = _mm_setzero_si128(); - out += 32; - n -= 16; - } -} - -void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) { - // interpolate half-sample positions - assert(sz <= 24); - - DECLARE_ALIGNED(16, static const int16_t, - kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; - - // Extend first/last samples (upper-left p[-1], last p[sz-1]) - // to support 4-tap filter - p[-2] = p[-1]; - p[sz] = p[sz - 1]; - - uint16_t *in = &p[-2]; - uint16_t *out = in; - int n = sz + 1; - - __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); - __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); - __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); - __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); - - while (n > 0) { - __m128i in1 = _mm_alignr_epi8(in8, in0, 2); - __m128i in2 = _mm_alignr_epi8(in8, in0, 4); - __m128i in3 = _mm_alignr_epi8(in8, in0, 6); - __m128i sum0 = _mm_add_epi16(in0, in3); - __m128i sum1 = _mm_add_epi16(in1, in2); - __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); - __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); - __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); - d0 = _mm_madd_epi16(d0, coef0); - d1 = _mm_madd_epi16(d1, coef0); - __m128i eight = _mm_set1_epi32(8); - d0 = _mm_add_epi32(d0, eight); - d1 = _mm_add_epi32(d1, eight); - d0 = _mm_srai_epi32(d0, 4); - d1 = _mm_srai_epi32(d1, 4); - d0 = _mm_packus_epi32(d0, d1); - __m128i max0 = _mm_set1_epi16((1 << bd) - 1); - d0 = _mm_min_epi16(d0, max0); - __m128i out0 = _mm_unpacklo_epi16(in1, d0); - __m128i out1 = _mm_unpackhi_epi16(in1, d0); - _mm_storeu_si128((__m128i *)&out[0], out0); - _mm_storeu_si128((__m128i *)&out[8], out1); - in0 = in8; - in8 = in16; - in16 = in24; - in24 = _mm_setzero_si128(); - out += 16; - n -= 8; - } -} diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c deleted file mode 100644 index 9f2e2b457..000000000 --- a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/x86/convolve_avx2.h" -#include "aom_dsp/x86/convolve_common_intrin.h" -#include "aom_dsp/x86/convolve_sse4_1.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "av1/common/convolve.h" - -static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m256i wt0 = _mm256_set1_epi16(w0); - const __m256i wt1 = _mm256_set1_epi16(w1); - const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); - return wt; -} - -static INLINE __m256i load_line2_avx2(const void *a, const void *b) { - return _mm256_permute2x128_si256( - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); -} - -void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int bd = 8; - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; - const int bits = FILTER_BITS - conv_params->round_1; - const __m256i wt = unpack_weights_avx2(conv_params); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - __m256i filt[4], coeffs[4]; - - assert(bits >= 0); - assert(conv_params->round_0 > 0); - - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - - const __m256i round_const = - _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); - - (void)filter_params_y; - (void)subpel_y_q4; - - for (i = 0; i < h; i += 2) { - const uint8_t *src_data = src_ptr + i * src_stride; - CONV_BUF_TYPE *dst_data = dst + i * dst_stride; - for (j = 0; j < w; j += 8) { - const __m256i data = - load_line2_avx2(&src_data[j], &src_data[j + src_stride]); - - __m256i res = convolve_lowbd_x(data, coeffs, filt); - - res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); - - res = _mm256_slli_epi16(res, bits); - - const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m256i data_ref_0 = - load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - - if (w > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); - } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); - } - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - } -} - -void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int bd = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; - // +1 to compensate for dividing the filter coeffs by 2 - const int left_shift = FILTER_BITS - conv_params->round_0 + 1; - const __m256i round_const = - _mm256_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - const __m256i wt = unpack_weights_avx2(conv_params); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi16(offset); - const int offset_1 = (1 << (bd + FILTER_BITS - 2)); - const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); - const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - const __m256i zero = _mm256_setzero_si256(); - __m256i coeffs[4], s[8]; - - assert((FILTER_BITS - conv_params->round_0) >= 0); - - prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); - - (void)conv_params; - (void)filter_params_x; - (void)subpel_x_q4; - - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - // Load lines a and b. Line a to lower 128, line b to upper 128 - { - __m256i src_ab[7]; - __m256i src_a[7]; - src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - for (int kk = 0; kk < 6; ++kk) { - data += src_stride; - src_a[kk + 1] = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); - } - src6 = src_a[6]; - s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); - s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); - s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); - s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); - s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); - s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); - } - - for (i = 0; i < h; i += 2) { - data = &src_ptr[(i + 7) * src_stride + j]; - const __m256i src7 = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); - - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); - - __m256i res_lo = convolve_lowbd(s, coeffs); - - res_lo = _mm256_add_epi16(res_lo, offset_const_1); - - const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); - const __m256i res_lo_0_shift = - _mm256_slli_epi32(res_lo_0_32b, left_shift); - const __m256i res_lo_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); - - const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); - const __m256i res_lo_1_shift = - _mm256_slli_epi32(res_lo_1_32b, left_shift); - const __m256i res_lo_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); - - const __m256i res_lo_round = - _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); - - const __m256i res_lo_unsigned = - _mm256_add_epi16(res_lo_round, offset_const_2); - - if (w - j < 16) { - if (do_average) { - const __m256i data_ref_0 = load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg); - - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - - if (w - j > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); - } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = - _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); - } - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - - const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } else { - __m256i res_hi = convolve_lowbd(s + 4, coeffs); - - res_hi = _mm256_add_epi16(res_hi, offset_const_1); - - const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); - const __m256i res_hi_0_shift = - _mm256_slli_epi32(res_hi_0_32b, left_shift); - const __m256i res_hi_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); - - const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); - const __m256i res_hi_1_shift = - _mm256_slli_epi32(res_hi_1_32b, left_shift); - const __m256i res_hi_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); - - const __m256i res_hi_round = - _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); - - const __m256i res_hi_unsigned = - _mm256_add_epi16(res_hi_round, offset_const_2); - - if (do_average) { - const __m256i data_ref_0_lo = load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - - const __m256i data_ref_0_hi = - load_line2_avx2(&dst[i * dst_stride + j + 8], - &dst[i * dst_stride + j + 8 + dst_stride]); - - const __m256i comp_avg_res_lo = - comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg); - - const __m256i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg); - - const __m256i round_result_lo = convolve_rounding( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - - const __m256i round_result_hi = convolve_rounding( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_8 = - _mm256_packus_epi16(round_result_lo, round_result_hi); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); - - } else { - const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); - - const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_lo_1); - - const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); - - const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); - _mm_store_si128( - (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); - } - } - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } -} - -void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int bd = 8; - - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - const __m256i wt = unpack_weights_avx2(conv_params); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4]; - - assert(conv_params->round_0 > 0); - - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - - const __m256i round_const_h = _mm256_set1_epi16( - ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); - const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); - - const __m256i round_const_v = _mm256_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); - - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { - const uint8_t *src_h = src_ptr + j; - for (i = 0; i < im_h; i += 2) { - __m256i data = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); - if (i + 1 < im_h) - data = _mm256_inserti128_si256( - data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); - src_h += (src_stride << 1); - __m256i res = convolve_lowbd_x(data, coeffs_x, filt); - - res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), - round_shift_h); - - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); - } - } - - /* Vertical filter */ - { - __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); - __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); - __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); - __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - - s[0] = _mm256_unpacklo_epi16(s0, s1); - s[1] = _mm256_unpacklo_epi16(s2, s3); - s[2] = _mm256_unpacklo_epi16(s4, s5); - - s[4] = _mm256_unpackhi_epi16(s0, s1); - s[5] = _mm256_unpackhi_epi16(s2, s3); - s[6] = _mm256_unpackhi_epi16(s4, s5); - - for (i = 0; i < h; i += 2) { - const int16_t *data = &im_block[i * im_stride]; - - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); - - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); - - const __m256i res_a = convolve(s, coeffs_y); - const __m256i res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a, round_const_v), round_shift_v); - - if (w - j > 4) { - const __m256i res_b = convolve(s + 4, coeffs_y); - const __m256i res_b_round = _mm256_sra_epi32( - _mm256_add_epi32(res_b, round_const_v), round_shift_v); - const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); - const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); - - if (do_average) { - const __m256i data_ref_0 = - load_line2_avx2(&dst[i * dst_stride + j], - &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_8 = - _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } else { - const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); - const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); - - if (do_average) { - const __m256i data_ref_0 = - load_line2_avx2(&dst[i * dst_stride + j], - &dst[i * dst_stride + j + dst_stride]); - - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_8 = - _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = - _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); - - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - } - } -} - -void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int bd = 8; - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - (void)filter_params_x; - (void)filter_params_y; - (void)subpel_x_q4; - (void)subpel_y_q4; - - const int bits = - FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; - const __m128i left_shift = _mm_cvtsi32_si128(bits); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const __m256i wt = unpack_weights_avx2(conv_params); - const __m256i zero = _mm256_setzero_si256(); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m256i offset_const = _mm256_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - int i, j; - - if (!(w % 16)) { - for (i = 0; i < h; i += 1) { - for (j = 0; j < w; j += 16) { - const __m256i src_16bit = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]))); - - const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); - const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); - - if (do_average) { - const __m256i data_ref_0 = - _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); - - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8); - - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), - _mm256_castsi256_si128(res_0)); - } else { - _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), - res_unsigned); - } - } - } - } else if (!(w % 4)) { - for (i = 0; i < h; i += 2) { - for (j = 0; j < w; j += 8) { - const __m128i src_row_0 = - _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); - const __m128i src_row_1 = - _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); - // since not all compilers yet support _mm256_set_m128i() - const __m256i src_10 = _mm256_insertf128_si256( - _mm256_castsi128_si256(src_row_0), src_row_1, 1); - - const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero); - - const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); - - const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m256i data_ref_0 = load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - - if (w > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); - } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = - _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); - } - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c deleted file mode 100644 index 87dc3242e..000000000 --- a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve_sse2.h" - -void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int bd = 8; - CONV_BUF_TYPE *dst = conv_params->dst; - const int dst_stride = conv_params->dst_stride; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *src_ptr = src - fo_horiz; - const int bits = FILTER_BITS - conv_params->round_1; - const __m128i left_shift = _mm_cvtsi32_si128(bits); - const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); - __m128i coeffs[4]; - - (void)filter_params_y; - (void)subpel_y_q4; - - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs); - - if (w == 4) { - do { - const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); - __m128i s[4]; - - s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); - s[1] = - _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); - s[2] = - _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); - s[3] = - _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); - const __m128i res_lo = convolve_lo_x(s, coeffs); - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); - - const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift); - const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); - } else { - _mm_store_si128((__m128i *)(&dst[0]), res_unsigned); - } - src_ptr += src_stride; - dst += dst_stride; - dst0 += dst_stride0; - } while (--h); - } else { - assert(!(w % 8)); - int i = 0; - do { - int j = 0; - do { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - __m128i s[4]; - - // Filter even-index pixels - s[0] = data; - s[1] = _mm_srli_si128(data, 2); - s[2] = _mm_srli_si128(data, 4); - s[3] = _mm_srli_si128(data, 6); - const __m128i res_even = convolve_lo_x(s, coeffs); - - // Filter odd-index pixels - s[0] = _mm_srli_si128(data, 1); - s[1] = _mm_srli_si128(data, 3); - s[2] = _mm_srli_si128(data, 5); - s[3] = _mm_srli_si128(data, 7); - const __m128i res_odd = convolve_lo_x(s, coeffs); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); - const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift); - - const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); - const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); - } else { - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); - } - j += 8; - } while (j < w); - } while (++i < h); - } -} - -void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - const int bd = 8; - CONV_BUF_TYPE *dst = conv_params->dst; - const int dst_stride = conv_params->dst_stride; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *src_ptr = src - fo_vert * src_stride; - const int bits = FILTER_BITS - conv_params->round_0; - const __m128i left_shift = _mm_cvtsi32_si128(bits); - const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset); - const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); - const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - __m128i coeffs[4]; - - (void)filter_params_x; - (void)subpel_x_q4; - - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs); - - if (w == 4) { - __m128i s[8], src6, res, res_shift; - src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); - s[4] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); - - do { - s[6] = _mm_unpacklo_epi8( - src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); - src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); - - res = convolve_lo_y(s + 0, coeffs); - res_shift = _mm_sll_epi32(res, left_shift); - res_shift = - _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); - - __m128i res_16b = _mm_packs_epi32(res_shift, res_shift); - __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); - - } else { - _mm_store_si128((__m128i *)dst, res_unsigned); - } - - src_ptr += src_stride; - dst += dst_stride; - dst0 += dst_stride0; - - res = convolve_lo_y(s + 1, coeffs); - res_shift = _mm_sll_epi32(res, left_shift); - res_shift = - _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); - - res_16b = _mm_packs_epi32(res_shift, res_shift); - res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); - - } else { - _mm_store_si128((__m128i *)dst, res_unsigned); - } - - src_ptr += src_stride; - dst += dst_stride; - dst0 += dst_stride0; - - s[0] = s[2]; - s[1] = s[3]; - s[2] = s[4]; - s[3] = s[5]; - s[4] = s[6]; - s[5] = s[7]; - h -= 2; - } while (h); - } else { - assert(!(w % 8)); - int j = 0; - do { - __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift; - const uint8_t *data = &src_ptr[j]; - - src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); - s[0] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); - s[1] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); - s[2] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); - s[3] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); - s[4] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), - _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); - s[5] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); - - int i = 0; - do { - data = &src_ptr[i * src_stride + j]; - s[6] = _mm_unpacklo_epi8( - src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); - src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); - s[7] = _mm_unpacklo_epi8( - _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); - - res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels - res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels - res_lo_shift = _mm_sll_epi32(res_lo, left_shift); - res_hi_shift = _mm_sll_epi32(res_hi, left_shift); - res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), - round_shift); - res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), - round_shift); - - __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); - __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); - } else { - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); - } - i++; - - res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels - res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels - res_lo_shift = _mm_sll_epi32(res_lo, left_shift); - res_hi_shift = _mm_sll_epi32(res_hi, left_shift); - res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), - round_shift); - res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), - round_shift); - res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); - res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - __m128i data_ref_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); - } else { - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); - } - i++; - - s[0] = s[2]; - s[1] = s[3]; - s[2] = s[4]; - s[3] = s[5]; - s[4] = s[6]; - s[5] = s[7]; - } while (i < h); - j += 8; - } while (j < w); - } -} diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c deleted file mode 100644 index 822772782..000000000 --- a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> - -#include "config/aom_dsp_rtcd.h" - -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve_sse2.h" - -void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int bd = 8; - - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m128i zero = _mm_setzero_si128(); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_q4 & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - const __m128i src_lo = _mm_unpacklo_epi8(data, zero); - const __m128i src_hi = _mm_unpackhi_epi8(data, zero); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); - const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_q4 & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); - const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - - if (w > 4) - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); - else - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = - _mm_cvtsi128_si32(res_8); - } else { - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c deleted file mode 100644 index f645e0454..000000000 --- a/third_party/aom/av1/common/x86/reconinter_avx2.c +++ /dev/null @@ -1,620 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/av1_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/blend.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/synonyms_avx2.h" -#include "av1/common/blockd.h" - -static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, - const __m256i s1) { - const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); - return _mm256_abs_epi16( - _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); - // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) -} -void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, - DIFFWTD_MASK_TYPE mask_type, - const uint8_t *src0, int stride0, - const uint8_t *src1, int stride1, - int h, int w) { - const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0; - const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); - int i = 0; - if (4 == w) { - do { - const __m128i s0A = xx_loadl_32(src0); - const __m128i s0B = xx_loadl_32(src0 + stride0); - const __m128i s0C = xx_loadl_32(src0 + stride0 * 2); - const __m128i s0D = xx_loadl_32(src0 + stride0 * 3); - const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); - const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); - const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); - const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); - - const __m128i s1A = xx_loadl_32(src1); - const __m128i s1B = xx_loadl_32(src1 + stride1); - const __m128i s1C = xx_loadl_32(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_32(src1 + stride1 * 3); - const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); - const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); - const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); - const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); - const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); - const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); - const __m128i x_m8 = - _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); - xx_storeu_128(mask, x_m8); - src0 += (stride0 << 2); - src1 += (stride1 << 2); - mask += 16; - i += 4; - } while (i < h); - } else if (8 == w) { - do { - const __m128i s0A = xx_loadl_64(src0); - const __m128i s0B = xx_loadl_64(src0 + stride0); - const __m128i s0C = xx_loadl_64(src0 + stride0 * 2); - const __m128i s0D = xx_loadl_64(src0 + stride0 * 3); - const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); - const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); - const __m128i s1A = xx_loadl_64(src1); - const __m128i s1B = xx_loadl_64(src1 + stride1); - const __m128i s1C = xx_loadl_64(src1 + stride1 * 2); - const __m128i s1D = xx_loadl_64(src1 + stride1 * 3); - const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); - const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); - const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); - const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); - const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); - yy_storeu_256(mask, m8); - src0 += stride0 << 2; - src1 += stride1 << 2; - mask += 32; - i += 4; - } while (i < h); - } else if (16 == w) { - do { - const __m128i s0A = xx_load_128(src0); - const __m128i s0B = xx_load_128(src0 + stride0); - const __m128i s1A = xx_load_128(src1); - const __m128i s1B = xx_load_128(src1 + stride1); - const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); - const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); - const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); - const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); - - const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); - const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); - - const __m256i m8 = - _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); - yy_storeu_256(mask, m8); - src0 += stride0 << 1; - src1 += stride1 << 1; - mask += 32; - i += 2; - } while (i < h); - } else { - do { - int j = 0; - do { - const __m256i s0 = yy_loadu_256(src0 + j); - const __m256i s1 = yy_loadu_256(src1 + j); - const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); - const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); - const __m256i s0H = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); - const __m256i s1H = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); - const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); - const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); - const __m256i m8 = - _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); - yy_storeu_256(mask + j, m8); - j += 32; - } while (j < w); - src0 += stride0; - src1 += stride1; - mask += w; - i += 1; - } while (i < h); - } -} - -static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, - const __m256i *data_src1, - const __m256i *round_const, - const __m256i *mask_base_16, - const __m256i *clip_diff, int round) { - const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); - const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); - const __m256i diff = _mm256_max_epu16(diffa, diffb); - const __m256i diff_round = - _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); - const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); - const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); - const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); - return diff_clamp; -} - -static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, - const __m256i *data_src1, - const __m256i *round_const, - const __m256i *mask_base_16, - const __m256i *clip_diff, - int round) { - const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); - const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); - const __m256i diff = _mm256_max_epu16(diffa, diffb); - const __m256i diff_round = - _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); - const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); - const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); - const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); - const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); - return diff_const_16; -} - -static INLINE void build_compound_diffwtd_mask_d16_avx2( - uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, - const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { - const int mask_base = 38; - const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); - const __m256i y38 = _mm256_set1_epi16(mask_base); - const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - int i = 0; - if (w == 4) { - do { - const __m128i s0A = xx_loadl_64(src0); - const __m128i s0B = xx_loadl_64(src0 + src0_stride); - const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); - const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); - const __m128i s1A = xx_loadl_64(src1); - const __m128i s1B = xx_loadl_64(src1 + src1_stride); - const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); - const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); - const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), - _mm_unpacklo_epi64(s0A, s0B)); - const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), - _mm_unpacklo_epi64(s1A, s1B)); - const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); - xx_storeu_128(mask, - _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); - src0 += src0_stride << 2; - src1 += src1_stride << 2; - mask += 16; - i += 4; - } while (i < h); - } else if (w == 8) { - do { - const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); - const __m256i s0CD = - yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); - const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); - const __m256i s1CD = - yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); - const __m256i m16AB = - calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); - const __m256i m16CD = - calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); - src0 += src0_stride << 2; - src1 += src1_stride << 2; - mask += 32; - i += 4; - } while (i < h); - } else if (w == 16) { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + src0_stride); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + src1_stride); - const __m256i m16A = - calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16A, m16B); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); - src0 += src0_stride << 1; - src1 += src1_stride << 1; - mask += 32; - i += 2; - } while (i < h); - } else if (w == 32) { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + 16); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + 16); - const __m256i m16A = - calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16A, m16B); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); - src0 += src0_stride; - src1 += src1_stride; - mask += 32; - i += 1; - } while (i < h); - } else if (w == 64) { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + 16); - const __m256i s0C = yy_loadu_256(src0 + 32); - const __m256i s0D = yy_loadu_256(src0 + 48); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + 16); - const __m256i s1C = yy_loadu_256(src1 + 32); - const __m256i s1D = yy_loadu_256(src1 + 48); - const __m256i m16A = - calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m16C = - calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); - const __m256i m16D = - calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); - const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); - const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); - yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); - src0 += src0_stride; - src1 += src1_stride; - mask += 64; - i += 1; - } while (i < h); - } else { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + 16); - const __m256i s0C = yy_loadu_256(src0 + 32); - const __m256i s0D = yy_loadu_256(src0 + 48); - const __m256i s0E = yy_loadu_256(src0 + 64); - const __m256i s0F = yy_loadu_256(src0 + 80); - const __m256i s0G = yy_loadu_256(src0 + 96); - const __m256i s0H = yy_loadu_256(src0 + 112); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + 16); - const __m256i s1C = yy_loadu_256(src1 + 32); - const __m256i s1D = yy_loadu_256(src1 + 48); - const __m256i s1E = yy_loadu_256(src1 + 64); - const __m256i s1F = yy_loadu_256(src1 + 80); - const __m256i s1G = yy_loadu_256(src1 + 96); - const __m256i s1H = yy_loadu_256(src1 + 112); - const __m256i m16A = - calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m16C = - calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); - const __m256i m16D = - calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); - const __m256i m16E = - calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); - const __m256i m16F = - calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); - const __m256i m16G = - calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); - const __m256i m16H = - calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); - const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); - const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); - const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); - const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); - yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); - yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); - yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); - src0 += src0_stride; - src1 += src1_stride; - mask += 128; - i += 1; - } while (i < h); - } -} - -static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( - uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, - const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { - const int mask_base = 38; - const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); - const __m256i y38 = _mm256_set1_epi16(mask_base); - const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - int i = 0; - if (w == 4) { - do { - const __m128i s0A = xx_loadl_64(src0); - const __m128i s0B = xx_loadl_64(src0 + src0_stride); - const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); - const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); - const __m128i s1A = xx_loadl_64(src1); - const __m128i s1B = xx_loadl_64(src1 + src1_stride); - const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); - const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); - const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), - _mm_unpacklo_epi64(s0A, s0B)); - const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), - _mm_unpacklo_epi64(s1A, s1B)); - const __m256i m16 = - calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); - xx_storeu_128(mask, - _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); - src0 += src0_stride << 2; - src1 += src1_stride << 2; - mask += 16; - i += 4; - } while (i < h); - } else if (w == 8) { - do { - const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); - const __m256i s0CD = - yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); - const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); - const __m256i s1CD = - yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); - const __m256i m16AB = - calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); - const __m256i m16CD = - calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); - src0 += src0_stride << 2; - src1 += src1_stride << 2; - mask += 32; - i += 4; - } while (i < h); - } else if (w == 16) { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + src0_stride); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + src1_stride); - const __m256i m16A = - calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16A, m16B); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); - src0 += src0_stride << 1; - src1 += src1_stride << 1; - mask += 32; - i += 2; - } while (i < h); - } else if (w == 32) { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + 16); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + 16); - const __m256i m16A = - calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m8 = _mm256_packus_epi16(m16A, m16B); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); - src0 += src0_stride; - src1 += src1_stride; - mask += 32; - i += 1; - } while (i < h); - } else if (w == 64) { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + 16); - const __m256i s0C = yy_loadu_256(src0 + 32); - const __m256i s0D = yy_loadu_256(src0 + 48); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + 16); - const __m256i s1C = yy_loadu_256(src1 + 32); - const __m256i s1D = yy_loadu_256(src1 + 48); - const __m256i m16A = - calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m16C = - calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); - const __m256i m16D = - calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); - const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); - const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); - yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); - src0 += src0_stride; - src1 += src1_stride; - mask += 64; - i += 1; - } while (i < h); - } else { - do { - const __m256i s0A = yy_loadu_256(src0); - const __m256i s0B = yy_loadu_256(src0 + 16); - const __m256i s0C = yy_loadu_256(src0 + 32); - const __m256i s0D = yy_loadu_256(src0 + 48); - const __m256i s0E = yy_loadu_256(src0 + 64); - const __m256i s0F = yy_loadu_256(src0 + 80); - const __m256i s0G = yy_loadu_256(src0 + 96); - const __m256i s0H = yy_loadu_256(src0 + 112); - const __m256i s1A = yy_loadu_256(src1); - const __m256i s1B = yy_loadu_256(src1 + 16); - const __m256i s1C = yy_loadu_256(src1 + 32); - const __m256i s1D = yy_loadu_256(src1 + 48); - const __m256i s1E = yy_loadu_256(src1 + 64); - const __m256i s1F = yy_loadu_256(src1 + 80); - const __m256i s1G = yy_loadu_256(src1 + 96); - const __m256i s1H = yy_loadu_256(src1 + 112); - const __m256i m16A = - calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); - const __m256i m16B = - calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); - const __m256i m16C = - calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); - const __m256i m16D = - calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); - const __m256i m16E = - calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); - const __m256i m16F = - calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); - const __m256i m16G = - calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); - const __m256i m16H = - calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); - const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); - const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); - const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); - const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); - yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); - yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); - yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); - yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); - src0 += src0_stride; - src1 += src1_stride; - mask += 128; - i += 1; - } while (i < h); - } -} - -void av1_build_compound_diffwtd_mask_d16_avx2( - uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, - int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, - ConvolveParams *conv_params, int bd) { - const int shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); - // When rounding constant is added, there is a possibility of overflow. - // However that much precision is not required. Code should very well work for - // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But - // there is a possibility of corner case bugs. - assert(DIFF_FACTOR_LOG2 == 4); - assert(AOM_BLEND_A64_MAX_ALPHA == 64); - - if (mask_type == DIFFWTD_38) { - build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, - src1_stride, h, w, shift); - } else { - build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, - src1_stride, h, w, shift); - } -} - -void av1_build_compound_diffwtd_mask_highbd_avx2( - uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, - int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, - int bd) { - if (w < 16) { - av1_build_compound_diffwtd_mask_highbd_ssse3( - mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); - } else { - assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); - assert(bd >= 8); - assert((w % 16) == 0); - const __m256i y0 = _mm256_setzero_si256(); - const __m256i yAOM_BLEND_A64_MAX_ALPHA = - _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const int mask_base = 38; - const __m256i ymask_base = _mm256_set1_epi16(mask_base); - const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); - const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); - if (bd == 8) { - if (mask_type == DIFFWTD_38_INV) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); - __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); - __m256i diff = _mm256_srai_epi16( - _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); - __m256i m = _mm256_min_epi16( - _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), - yAOM_BLEND_A64_MAX_ALPHA); - m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); - m = _mm256_packus_epi16(m, m); - m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); - __m128i m0 = _mm256_castsi256_si128(m); - _mm_storeu_si128((__m128i *)&mask[j], m0); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); - __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); - __m256i diff = _mm256_srai_epi16( - _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); - __m256i m = _mm256_min_epi16( - _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), - yAOM_BLEND_A64_MAX_ALPHA); - m = _mm256_packus_epi16(m, m); - m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); - __m128i m0 = _mm256_castsi256_si128(m); - _mm_storeu_si128((__m128i *)&mask[j], m0); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } - } else { - const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); - if (mask_type == DIFFWTD_38_INV) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); - __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); - __m256i diff = _mm256_sra_epi16( - _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); - __m256i m = _mm256_min_epi16( - _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), - yAOM_BLEND_A64_MAX_ALPHA); - m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); - m = _mm256_packus_epi16(m, m); - m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); - __m128i m0 = _mm256_castsi256_si128(m); - _mm_storeu_si128((__m128i *)&mask[j], m0); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); - __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); - __m256i diff = _mm256_sra_epi16( - _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); - __m256i m = _mm256_min_epi16( - _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), - yAOM_BLEND_A64_MAX_ALPHA); - m = _mm256_packus_epi16(m, m); - m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); - __m128i m0 = _mm256_castsi256_si128(m); - _mm_storeu_si128((__m128i *)&mask[j], m0); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c deleted file mode 100644 index 5171ca493..000000000 --- a/third_party/aom/av1/common/x86/reconinter_sse4.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> // SSE2 -#include <smmintrin.h> /* SSE4.1 */ - -#include "aom/aom_integer.h" -#include "aom_dsp/blend.h" -#include "av1/common/blockd.h" - -static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, - const __m128i s1) { - const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); - return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); - // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) -} - -void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, - DIFFWTD_MASK_TYPE mask_type, - const uint8_t *src0, int stride0, - const uint8_t *src1, int stride1, - int h, int w) { - const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0; - const __m128i mask_base = _mm_set1_epi16(38 - mb); - int i = 0; - if (4 == w) { - do { - const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0); - const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0)); - const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); - const __m128i s0 = _mm_cvtepu8_epi16(s0AB); - - const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1); - const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1)); - const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); - const __m128i s1 = _mm_cvtepu8_epi16(s1AB); - - const __m128i m16 = calc_mask(mask_base, s0, s1); - const __m128i m8 = _mm_packus_epi16(m16, m16); - - *(uint32_t *)mask = _mm_cvtsi128_si32(m8); - *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1); - src0 += (stride0 << 1); - src1 += (stride1 << 1); - mask += 8; - i += 2; - } while (i < h); - } else if (8 == w) { - do { - __m128i s0 = _mm_loadl_epi64((__m128i const *)src0); - __m128i s1 = _mm_loadl_epi64((__m128i const *)src1); - s0 = _mm_cvtepu8_epi16(s0); - s1 = _mm_cvtepu8_epi16(s1); - const __m128i m16 = calc_mask(mask_base, s0, s1); - const __m128i m8 = _mm_packus_epi16(m16, m16); - _mm_storel_epi64((__m128i *)mask, m8); - src0 += stride0; - src1 += stride1; - mask += 8; - i += 1; - } while (i < h); - } else { - const __m128i zero = _mm_setzero_si128(); - do { - int j = 0; - do { - const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j)); - const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j)); - const __m128i s0L = _mm_cvtepu8_epi16(s0); - const __m128i s1L = _mm_cvtepu8_epi16(s1); - const __m128i s0H = _mm_unpackhi_epi8(s0, zero); - const __m128i s1H = _mm_unpackhi_epi8(s1, zero); - - const __m128i m16L = calc_mask(mask_base, s0L, s1L); - const __m128i m16H = calc_mask(mask_base, s0H, s1H); - - const __m128i m8 = _mm_packus_epi16(m16L, m16H); - _mm_store_si128((__m128i *)(mask + j), m8); - j += 16; - } while (j < w); - src0 += stride0; - src1 += stride1; - mask += w; - i += 1; - } while (i < h); - } -} - -void av1_build_compound_diffwtd_mask_d16_sse4_1( - uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, - int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, - ConvolveParams *conv_params, int bd) { - const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1; - const int mask_base = 38; - int round = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); - const __m128i round_const = _mm_set1_epi16((1 << round) >> 1); - const __m128i mask_base_16 = _mm_set1_epi16(mask_base); - const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i add_const = - _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0)); - const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1)); - - int i, j; - // When rounding constant is added, there is a possibility of overflow. - // However that much precision is not required. Code should very well work for - // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But - // there is a possibility of corner case bugs. - assert(DIFF_FACTOR_LOG2 == 4); - assert(AOM_BLEND_A64_MAX_ALPHA == 64); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data_src0 = - _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]); - const __m128i data_src1 = - _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]); - - const __m128i diffa = _mm_subs_epu16(data_src0, data_src1); - const __m128i diffb = _mm_subs_epu16(data_src1, data_src0); - const __m128i diff = _mm_max_epu16(diffa, diffb); - const __m128i diff_round = - _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round); - const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2); - const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16); - __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff); - // clamp to 0 can be skipped since we are using add and saturate - // instruction - - const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign); - const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const); - - // 8 bit conversion and saturation to uint8 - const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16); - - // Store values into the destination buffer - __m128i *const dst = (__m128i *)&mask[i * w + j]; - - if ((w - j) > 4) { - _mm_storel_epi64(dst, res_8); - } else { // w==4 - *(uint32_t *)dst = _mm_cvtsi128_si32(res_8); - } - } - } -} diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c deleted file mode 100644 index cf684447c..000000000 --- a/third_party/aom/av1/common/x86/reconinter_ssse3.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <tmmintrin.h> - -#include "config/av1_rtcd.h" - -#include "aom/aom_integer.h" -#include "aom_dsp/blend.h" -#include "aom_dsp/x86/synonyms.h" -#include "av1/common/blockd.h" - -void av1_build_compound_diffwtd_mask_highbd_ssse3( - uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, - int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, - int bd) { - if (w < 8) { - av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, - src1, src1_stride, h, w, bd); - } else { - assert(bd >= 8); - assert((w % 8) == 0); - assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); - const __m128i x0 = _mm_setzero_si128(); - const __m128i xAOM_BLEND_A64_MAX_ALPHA = - _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const int mask_base = 38; - const __m128i xmask_base = _mm_set1_epi16(mask_base); - const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); - const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); - if (bd == 8) { - if (mask_type == DIFFWTD_38_INV) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); - __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); - __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), - DIFF_FACTOR_LOG2); - __m128i m = _mm_min_epi16( - _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), - xAOM_BLEND_A64_MAX_ALPHA); - m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); - m = _mm_packus_epi16(m, m); - _mm_storel_epi64((__m128i *)&mask[j], m); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); - __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); - __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), - DIFF_FACTOR_LOG2); - __m128i m = _mm_min_epi16( - _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), - xAOM_BLEND_A64_MAX_ALPHA); - m = _mm_packus_epi16(m, m); - _mm_storel_epi64((__m128i *)&mask[j], m); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } - } else { - const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); - if (mask_type == DIFFWTD_38_INV) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); - __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); - __m128i diff = - _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); - __m128i m = _mm_min_epi16( - _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), - xAOM_BLEND_A64_MAX_ALPHA); - m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); - m = _mm_packus_epi16(m, m); - _mm_storel_epi64((__m128i *)&mask[j], m); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); - __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); - __m128i diff = - _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); - __m128i m = _mm_min_epi16( - _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), - xAOM_BLEND_A64_MAX_ALPHA); - m = _mm_packus_epi16(m, m); - _mm_storel_epi64((__m128i *)&mask[j], m); - } - ssrc0 += src0_stride; - ssrc1 += src1_stride; - mask += w; - } - } - } - } -} diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c deleted file mode 100644 index 0aaf1f454..000000000 --- a/third_party/aom/av1/common/x86/selfguided_avx2.c +++ /dev/null @@ -1,724 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "av1/common/restoration.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/synonyms_avx2.h" - -// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to -// 32-bit precision and return them in an AVX2 register. -static __m256i yy256_load_extend_8_32(const void *p) { - return _mm256_cvtepu8_epi32(xx_loadl_64(p)); -} - -// Load 8 halfwords from the possibly-misaligned pointer p, extend each -// halfword to 32-bit precision and return them in an AVX2 register. -static __m256i yy256_load_extend_16_32(const void *p) { - return _mm256_cvtepu16_epi32(xx_loadu_128(p)); -} - -// Compute the scan of an AVX2 register holding 8 32-bit integers. If the -// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ..., -// x0+x1+...+x7 -// -// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers -// (assumed small enough to be able to add them without overflow). -// -// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a. -// -// x = [h g f e][d c b a] -// x01 = [g f e 0][c b a 0] -// x02 = [g+h f+g e+f e][c+d b+c a+b a] -// x03 = [e+f e 0 0][a+b a 0 0] -// x04 = [e->h e->g e->f e][a->d a->c a->b a] -// s = a->d -// s01 = [a->d a->d a->d a->d] -// s02 = [a->d a->d a->d a->d][0 0 0 0] -// ret = [a->h a->g a->f a->e][a->d a->c a->b a] -static __m256i scan_32(__m256i x) { - const __m256i x01 = _mm256_slli_si256(x, 4); - const __m256i x02 = _mm256_add_epi32(x, x01); - const __m256i x03 = _mm256_slli_si256(x02, 8); - const __m256i x04 = _mm256_add_epi32(x02, x03); - const int32_t s = _mm256_extract_epi32(x04, 3); - const __m128i s01 = _mm_set1_epi32(s); - const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1); - return _mm256_add_epi32(x04, s02); -} - -// Compute two integral images from src. B sums elements; A sums their -// squares. The images are offset by one pixel, so will have width and height -// equal to width + 1, height + 1 and the first row and column will be zero. -// -// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple -// of 8. - -static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) { - unsigned int i = 0; - for (i = 0; i < (count & 0xffffffe0); i += 32) { - _mm256_storeu_si256((__m256i *)(dest + i), *zero); - _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero); - _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero); - _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero); - } - for (; i < (count & 0xfffffff8); i += 8) { - _mm256_storeu_si256((__m256i *)(dest + i), *zero); - } - for (; i < count; i++) { - dest[i] = 0; - } - return dest; -} - -static void integral_images(const uint8_t *src, int src_stride, int width, - int height, int32_t *A, int32_t *B, - int buf_stride) { - const __m256i zero = _mm256_setzero_si256(); - // Write out the zero top row - memset_zero_avx(A, &zero, (width + 8)); - memset_zero_avx(B, &zero, (width + 8)); - for (int i = 0; i < height; ++i) { - // Zero the left column. - A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; - - // ldiff is the difference H - D where H is the output sample immediately - // to the left and D is the output sample above it. These are scalars, - // replicated across the eight lanes. - __m256i ldiff1 = zero, ldiff2 = zero; - for (int j = 0; j < width; j += 8) { - const int ABj = 1 + j; - - const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); - const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); - - const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride); - const __m256i x2 = _mm256_madd_epi16(x1, x1); - - const __m256i sc1 = scan_32(x1); - const __m256i sc2 = scan_32(x2); - - const __m256i row1 = - _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); - const __m256i row2 = - _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); - - yy_store_256(B + ABj + (i + 1) * buf_stride, row1); - yy_store_256(A + ABj + (i + 1) * buf_stride, row2); - - // Calculate the new H - D. - ldiff1 = _mm256_set1_epi32( - _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); - ldiff2 = _mm256_set1_epi32( - _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); - } - } -} - -// Compute two integral images from src. B sums elements; A sums their squares -// -// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8. -static void integral_images_highbd(const uint16_t *src, int src_stride, - int width, int height, int32_t *A, - int32_t *B, int buf_stride) { - const __m256i zero = _mm256_setzero_si256(); - // Write out the zero top row - memset_zero_avx(A, &zero, (width + 8)); - memset_zero_avx(B, &zero, (width + 8)); - - for (int i = 0; i < height; ++i) { - // Zero the left column. - A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; - - // ldiff is the difference H - D where H is the output sample immediately - // to the left and D is the output sample above it. These are scalars, - // replicated across the eight lanes. - __m256i ldiff1 = zero, ldiff2 = zero; - for (int j = 0; j < width; j += 8) { - const int ABj = 1 + j; - - const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); - const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); - - const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride); - const __m256i x2 = _mm256_madd_epi16(x1, x1); - - const __m256i sc1 = scan_32(x1); - const __m256i sc2 = scan_32(x2); - - const __m256i row1 = - _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); - const __m256i row2 = - _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); - - yy_store_256(B + ABj + (i + 1) * buf_stride, row1); - yy_store_256(A + ABj + (i + 1) * buf_stride, row2); - - // Calculate the new H - D. - ldiff1 = _mm256_set1_epi32( - _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); - ldiff2 = _mm256_set1_epi32( - _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); - } - } -} - -// Compute 8 values of boxsum from the given integral image. ii should point -// at the middle of the box (for the first value). r is the box radius. -static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { - const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); - const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); - const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); - const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride); - const __m256i u = _mm256_sub_epi32(tr, tl); - const __m256i v = _mm256_sub_epi32(br, bl); - return _mm256_sub_epi32(v, u); -} - -static __m256i round_for_shift(unsigned shift) { - return _mm256_set1_epi32((1 << shift) >> 1); -} - -static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { - __m256i an, bb; - if (bit_depth > 8) { - const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8)); - const __m256i rounding_b = round_for_shift(bit_depth - 8); - const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); - const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); - const __m256i a = - _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a); - const __m256i b = - _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b); - // b < 2^14, so we can use a 16-bit madd rather than a 32-bit - // mullo to square it - bb = _mm256_madd_epi16(b, b); - an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb); - } else { - bb = _mm256_madd_epi16(sum1, sum1); - an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n)); - } - return _mm256_sub_epi32(an, bb); -} - -// Assumes that C, D are integral images for the original buffer which has been -// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels -// on the sides. A, B, C, D point at logical position (0, 0). -static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, - int width, int height, int buf_stride, int bit_depth, - int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - const int r = params->r[radius_idx]; - const int n = (2 * r + 1) * (2 * r + 1); - const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); - // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); - - const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); - const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); - - // Set up masks - const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); - __m256i mask[8]; - for (int idx = 0; idx < 8; idx++) { - const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); - mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); - } - - for (int i = -1; i < height + 1; ++i) { - for (int j = -1; j < width + 1; j += 8) { - const int32_t *Cij = C + i * buf_stride + j; - const int32_t *Dij = D + i * buf_stride + j; - - __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); - __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); - - // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain - // some uninitialised data in their upper words. We use a mask to - // ensure that these bits are set to 0. - int idx = AOMMIN(8, width + 1 - j); - assert(idx >= 1); - - if (idx < 8) { - sum1 = _mm256_and_si256(mask[idx], sum1); - sum2 = _mm256_and_si256(mask[idx], sum2); - } - - const __m256i p = compute_p(sum1, sum2, bit_depth, n); - - const __m256i z = _mm256_min_epi32( - _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), - SGRPROJ_MTABLE_BITS), - _mm256_set1_epi32(255)); - - const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); - - yy_storeu_256(A + i * buf_stride + j, a_res); - - const __m256i a_complement = - _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); - - // sum1 might have lanes greater than 2^15, so we can't use madd to do - // multiplication involving sum1. However, a_complement and one_over_n - // are both less than 256, so we can multiply them first. - const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); - const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); - const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), - SGRPROJ_RECIP_BITS); - - yy_storeu_256(B + i * buf_stride + j, b_res); - } - } -} - -// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter -// where the outer four corners have weight 3 and all other pixels have weight -// 4. -// -// Pixels are indexed as follows: -// xtl xt xtr -// xl x xr -// xbl xb xbr -// -// buf points to x -// -// fours = xl + xt + xr + xb + x -// threes = xtl + xtr + xbr + xbl -// cross_sum = 4 * fours + 3 * threes -// = 4 * (fours + threes) - threes -// = (fours + threes) << 2 - threes -static INLINE __m256i cross_sum(const int32_t *buf, int stride) { - const __m256i xtl = yy_loadu_256(buf - 1 - stride); - const __m256i xt = yy_loadu_256(buf - stride); - const __m256i xtr = yy_loadu_256(buf + 1 - stride); - const __m256i xl = yy_loadu_256(buf - 1); - const __m256i x = yy_loadu_256(buf); - const __m256i xr = yy_loadu_256(buf + 1); - const __m256i xbl = yy_loadu_256(buf - 1 + stride); - const __m256i xb = yy_loadu_256(buf + stride); - const __m256i xbr = yy_loadu_256(buf + 1 + stride); - - const __m256i fours = _mm256_add_epi32( - xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x)))); - const __m256i threes = - _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); - - return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2), - threes); -} - -// The final filter for self-guided restoration. Computes a weighted average -// across A, B with "cross sums" (see cross_sum implementation above). -static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, - const int32_t *B, int buf_stride, const void *dgd8, - int dgd_stride, int width, int height, int highbd) { - const int nb = 5; - const __m256i rounding = - round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - const uint8_t *dgd_real = - highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; - - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; j += 8) { - const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride); - const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride); - - const __m128i raw = - xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); - const __m256i src = - highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); - - __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); - __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding), - SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - - yy_storeu_256(dst + i * dst_stride + j, w); - } - } -} - -// Assumes that C, D are integral images for the original buffer which has been -// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels -// on the sides. A, B, C, D point at logical position (0, 0). -static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, - const int32_t *D, int width, int height, - int buf_stride, int bit_depth, int sgr_params_idx, - int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - const int r = params->r[radius_idx]; - const int n = (2 * r + 1) * (2 * r + 1); - const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); - // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]); - - const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); - const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); - - // Set up masks - const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); - __m256i mask[8]; - for (int idx = 0; idx < 8; idx++) { - const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); - mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); - } - - for (int i = -1; i < height + 1; i += 2) { - for (int j = -1; j < width + 1; j += 8) { - const int32_t *Cij = C + i * buf_stride + j; - const int32_t *Dij = D + i * buf_stride + j; - - __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); - __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); - - // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain - // some uninitialised data in their upper words. We use a mask to - // ensure that these bits are set to 0. - int idx = AOMMIN(8, width + 1 - j); - assert(idx >= 1); - - if (idx < 8) { - sum1 = _mm256_and_si256(mask[idx], sum1); - sum2 = _mm256_and_si256(mask[idx], sum2); - } - - const __m256i p = compute_p(sum1, sum2, bit_depth, n); - - const __m256i z = _mm256_min_epi32( - _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), - SGRPROJ_MTABLE_BITS), - _mm256_set1_epi32(255)); - - const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4); - - yy_storeu_256(A + i * buf_stride + j, a_res); - - const __m256i a_complement = - _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); - - // sum1 might have lanes greater than 2^15, so we can't use madd to do - // multiplication involving sum1. However, a_complement and one_over_n - // are both less than 256, so we can multiply them first. - const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); - const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); - const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), - SGRPROJ_RECIP_BITS); - - yy_storeu_256(B + i * buf_stride + j, b_res); - } - } -} - -// Calculate 8 values of the "cross sum" starting at buf. -// -// Pixels are indexed like this: -// xtl xt xtr -// - buf - -// xbl xb xbr -// -// Pixels are weighted like this: -// 5 6 5 -// 0 0 0 -// 5 6 5 -// -// fives = xtl + xtr + xbl + xbr -// sixes = xt + xb -// cross_sum = 6 * sixes + 5 * fives -// = 5 * (fives + sixes) - sixes -// = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { - const __m256i xtl = yy_loadu_256(buf - 1 - stride); - const __m256i xt = yy_loadu_256(buf - stride); - const __m256i xtr = yy_loadu_256(buf + 1 - stride); - const __m256i xbl = yy_loadu_256(buf - 1 + stride); - const __m256i xb = yy_loadu_256(buf + stride); - const __m256i xbr = yy_loadu_256(buf + 1 + stride); - - const __m256i fives = - _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); - const __m256i sixes = _mm256_add_epi32(xt, xb); - const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); - - return _mm256_add_epi32( - _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), - fives_plus_sixes), - sixes); -} - -// Calculate 8 values of the "cross sum" starting at buf. -// -// Pixels are indexed like this: -// xl x xr -// -// Pixels are weighted like this: -// 5 6 5 -// -// buf points to x -// -// fives = xl + xr -// sixes = x -// cross_sum = 5 * fives + 6 * sixes -// = 4 * (fives + sixes) + (fives + sixes) + sixes -// = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) { - const __m256i xl = yy_loadu_256(buf - 1); - const __m256i x = yy_loadu_256(buf); - const __m256i xr = yy_loadu_256(buf + 1); - - const __m256i fives = _mm256_add_epi32(xl, xr); - const __m256i sixes = x; - - const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); - - return _mm256_add_epi32( - _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), - fives_plus_sixes), - sixes); -} - -// The final filter for the self-guided restoration. Computes a -// weighted average across A, B with "cross sums" (see cross_sum_... -// implementations above). -static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, - const int32_t *B, int buf_stride, - const void *dgd8, int dgd_stride, int width, - int height, int highbd) { - const int nb0 = 5; - const int nb1 = 4; - - const __m256i rounding0 = - round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); - const __m256i rounding1 = - round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); - - const uint8_t *dgd_real = - highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; - - for (int i = 0; i < height; ++i) { - if (!(i & 1)) { // even row - for (int j = 0; j < width; j += 8) { - const __m256i a = - cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); - const __m256i b = - cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); - - const __m128i raw = - xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); - const __m256i src = - highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); - - __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); - __m256i w = - _mm256_srai_epi32(_mm256_add_epi32(v, rounding0), - SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); - - yy_storeu_256(dst + i * dst_stride + j, w); - } - } else { // odd row - for (int j = 0; j < width; j += 8) { - const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j); - const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j); - - const __m128i raw = - xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); - const __m256i src = - highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); - - __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); - __m256i w = - _mm256_srai_epi32(_mm256_add_epi32(v, rounding1), - SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); - - yy_storeu_256(dst + i * dst_stride + j, w); - } - } - } -} - -int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, - int dgd_stride, int32_t *flt0, - int32_t *flt1, int flt_stride, - int sgr_params_idx, int bit_depth, - int highbd) { - // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, - // Ctl and Dtl is 32-byte aligned. - const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); - - int32_t *buf = aom_memalign( - 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)); - if (!buf) return -1; - - const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; - const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; - - // Adjusting the stride of A and B here appears to avoid bad cache effects, - // leading to a significant speed improvement. - // We also align the stride to a multiple of 32 bytes for efficiency. - int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3); - - // The "tl" pointers point at the top-left of the initialised data for the - // array. - int32_t *Atl = buf + 0 * buf_elts + 7; - int32_t *Btl = buf + 1 * buf_elts + 7; - int32_t *Ctl = buf + 2 * buf_elts + 7; - int32_t *Dtl = buf + 3 * buf_elts + 7; - - // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note - // there's a zero row and column in A, B (integral images), so we move down - // and right one for them. - const int buf_diag_border = - SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; - - int32_t *A0 = Atl + 1 + buf_stride; - int32_t *B0 = Btl + 1 + buf_stride; - int32_t *C0 = Ctl + 1 + buf_stride; - int32_t *D0 = Dtl + 1 + buf_stride; - - // Finally, A, B, C, D point at position (0, 0). - int32_t *A = A0 + buf_diag_border; - int32_t *B = B0 + buf_diag_border; - int32_t *C = C0 + buf_diag_border; - int32_t *D = D0 + buf_diag_border; - - const int dgd_diag_border = - SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; - const uint8_t *dgd0 = dgd8 - dgd_diag_border; - - // Generate integral images from the input. C will contain sums of squares; D - // will contain just sums - if (highbd) - integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, - height_ext, Ctl, Dtl, buf_stride); - else - integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, - buf_stride); - - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - // Write to flt0 and flt1 - // If params->r == 0 we skip the corresponding filter. We only allow one of - // the radii to be 0, as having both equal to 0 would be equivalent to - // skipping SGR entirely. - assert(!(params->r[0] == 0 && params->r[1] == 0)); - assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); - assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); - - if (params->r[0] > 0) { - calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, - sgr_params_idx, 0); - final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, - width, height, highbd); - } - - if (params->r[1] > 0) { - calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, - 1); - final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, - height, highbd); - } - aom_free(buf); - return 0; -} - -void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { - int32_t *flt0 = tmpbuf; - int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; - assert(width * height <= RESTORATION_UNITPELS_MAX); - const int ret = av1_selfguided_restoration_avx2( - dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); - (void)ret; - assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; - int xq[2]; - decode_xq(xqd, xq, params); - - __m256i xq0 = _mm256_set1_epi32(xq[0]); - __m256i xq1 = _mm256_set1_epi32(xq[1]); - - for (int i = 0; i < height; ++i) { - // Calculate output in batches of 16 pixels - for (int j = 0; j < width; j += 16) { - const int k = i * width + j; - const int m = i * dst_stride + j; - - const uint8_t *dat8ij = dat8 + i * stride + j; - __m256i ep_0, ep_1; - __m128i src_0, src_1; - if (highbd) { - src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); - src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8)); - ep_0 = _mm256_cvtepu16_epi32(src_0); - ep_1 = _mm256_cvtepu16_epi32(src_1); - } else { - src_0 = xx_loadu_128(dat8ij); - ep_0 = _mm256_cvtepu8_epi32(src_0); - ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8)); - } - - const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS); - const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS); - - __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS); - __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS); - - if (params->r[0] > 0) { - const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0); - v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0)); - - const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1); - v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1)); - } - - if (params->r[1] > 0) { - const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0); - v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0)); - - const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1); - v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1)); - } - - const __m256i rounding = - round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - const __m256i w_0 = _mm256_srai_epi32( - _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - const __m256i w_1 = _mm256_srai_epi32( - _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - - if (highbd) { - // Pack into 16 bits and clamp to [0, 2^bit_depth) - // Note that packing into 16 bits messes up the order of the bits, - // so we use a permute function to correct this - const __m256i tmp = _mm256_packus_epi32(w_0, w_1); - const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); - const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1); - const __m256i res = _mm256_min_epi16(tmp2, max); - yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res); - } else { - // Pack into 8 bits and clamp to [0, 256) - // Note that each pack messes up the order of the bits, - // so we use a permute function to correct this - const __m256i tmp = _mm256_packs_epi32(w_0, w_1); - const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); - const __m256i res = - _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */); - const __m128i res2 = - _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8)); - xx_storeu_128(dst8 + m, res2); - } - } - } -} diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c deleted file mode 100644 index ea3f6d942..000000000 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ /dev/null @@ -1,660 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <smmintrin.h> - -#include "config/aom_config.h" -#include "config/av1_rtcd.h" - -#include "av1/common/restoration.h" -#include "aom_dsp/x86/synonyms.h" - -// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to -// 32-bit precision and return them in an SSE register. -static __m128i xx_load_extend_8_32(const void *p) { - return _mm_cvtepu8_epi32(xx_loadl_32(p)); -} - -// Load 4 halfwords from the possibly-misaligned pointer p, extend each -// halfword to 32-bit precision and return them in an SSE register. -static __m128i xx_load_extend_16_32(const void *p) { - return _mm_cvtepu16_epi32(xx_loadl_64(p)); -} - -// Compute the scan of an SSE register holding 4 32-bit integers. If the -// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2, -// x0+x1+x2+x3 -static __m128i scan_32(__m128i x) { - const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4)); - return _mm_add_epi32(x01, _mm_slli_si128(x01, 8)); -} - -// Compute two integral images from src. B sums elements; A sums their -// squares. The images are offset by one pixel, so will have width and height -// equal to width + 1, height + 1 and the first row and column will be zero. -// -// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple -// of 4. -static void integral_images(const uint8_t *src, int src_stride, int width, - int height, int32_t *A, int32_t *B, - int buf_stride) { - // Write out the zero top row - memset(A, 0, sizeof(*A) * (width + 1)); - memset(B, 0, sizeof(*B) * (width + 1)); - - const __m128i zero = _mm_setzero_si128(); - for (int i = 0; i < height; ++i) { - // Zero the left column. - A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; - - // ldiff is the difference H - D where H is the output sample immediately - // to the left and D is the output sample above it. These are scalars, - // replicated across the four lanes. - __m128i ldiff1 = zero, ldiff2 = zero; - for (int j = 0; j < width; j += 4) { - const int ABj = 1 + j; - - const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); - const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); - - const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride); - const __m128i x2 = _mm_madd_epi16(x1, x1); - - const __m128i sc1 = scan_32(x1); - const __m128i sc2 = scan_32(x2); - - const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); - const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); - - xx_store_128(B + ABj + (i + 1) * buf_stride, row1); - xx_store_128(A + ABj + (i + 1) * buf_stride, row2); - - // Calculate the new H - D. - ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); - ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); - } - } -} - -// Compute two integral images from src. B sums elements; A sums their squares -// -// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4. -static void integral_images_highbd(const uint16_t *src, int src_stride, - int width, int height, int32_t *A, - int32_t *B, int buf_stride) { - // Write out the zero top row - memset(A, 0, sizeof(*A) * (width + 1)); - memset(B, 0, sizeof(*B) * (width + 1)); - - const __m128i zero = _mm_setzero_si128(); - for (int i = 0; i < height; ++i) { - // Zero the left column. - A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; - - // ldiff is the difference H - D where H is the output sample immediately - // to the left and D is the output sample above it. These are scalars, - // replicated across the four lanes. - __m128i ldiff1 = zero, ldiff2 = zero; - for (int j = 0; j < width; j += 4) { - const int ABj = 1 + j; - - const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); - const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); - - const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride); - const __m128i x2 = _mm_madd_epi16(x1, x1); - - const __m128i sc1 = scan_32(x1); - const __m128i sc2 = scan_32(x2); - - const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); - const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); - - xx_store_128(B + ABj + (i + 1) * buf_stride, row1); - xx_store_128(A + ABj + (i + 1) * buf_stride, row2); - - // Calculate the new H - D. - ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); - ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); - } - } -} - -// Compute 4 values of boxsum from the given integral image. ii should point -// at the middle of the box (for the first value). r is the box radius. -static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { - const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); - const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); - const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); - const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride); - const __m128i u = _mm_sub_epi32(tr, tl); - const __m128i v = _mm_sub_epi32(br, bl); - return _mm_sub_epi32(v, u); -} - -static __m128i round_for_shift(unsigned shift) { - return _mm_set1_epi32((1 << shift) >> 1); -} - -static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { - __m128i an, bb; - if (bit_depth > 8) { - const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8)); - const __m128i rounding_b = round_for_shift(bit_depth - 8); - const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); - const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); - const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a); - const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b); - // b < 2^14, so we can use a 16-bit madd rather than a 32-bit - // mullo to square it - bb = _mm_madd_epi16(b, b); - an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb); - } else { - bb = _mm_madd_epi16(sum1, sum1); - an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n)); - } - return _mm_sub_epi32(an, bb); -} - -// Assumes that C, D are integral images for the original buffer which has been -// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels -// on the sides. A, B, C, D point at logical position (0, 0). -static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, - int width, int height, int buf_stride, int bit_depth, - int sgr_params_idx, int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - const int r = params->r[radius_idx]; - const int n = (2 * r + 1) * (2 * r + 1); - const __m128i s = _mm_set1_epi32(params->s[radius_idx]); - // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); - - const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); - const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); - - // Set up masks - const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); - __m128i mask[4]; - for (int idx = 0; idx < 4; idx++) { - const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); - mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); - } - - for (int i = -1; i < height + 1; ++i) { - for (int j = -1; j < width + 1; j += 4) { - const int32_t *Cij = C + i * buf_stride + j; - const int32_t *Dij = D + i * buf_stride + j; - - __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); - __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); - - // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain - // some uninitialised data in their upper words. We use a mask to - // ensure that these bits are set to 0. - int idx = AOMMIN(4, width + 1 - j); - assert(idx >= 1); - - if (idx < 4) { - sum1 = _mm_and_si128(mask[idx], sum1); - sum2 = _mm_and_si128(mask[idx], sum2); - } - - const __m128i p = compute_p(sum1, sum2, bit_depth, n); - - const __m128i z = _mm_min_epi32( - _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), - SGRPROJ_MTABLE_BITS), - _mm_set1_epi32(255)); - - // 'Gather' type instructions are not available pre-AVX2, so synthesize a - // gather using scalar loads. - const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], - x_by_xplus1[_mm_extract_epi32(z, 2)], - x_by_xplus1[_mm_extract_epi32(z, 1)], - x_by_xplus1[_mm_extract_epi32(z, 0)]); - - xx_storeu_128(A + i * buf_stride + j, a_res); - - const __m128i a_complement = - _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); - - // sum1 might have lanes greater than 2^15, so we can't use madd to do - // multiplication involving sum1. However, a_complement and one_over_n - // are both less than 256, so we can multiply them first. - const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); - const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); - const __m128i b_res = - _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); - - xx_storeu_128(B + i * buf_stride + j, b_res); - } - } -} - -// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter -// where the outer four corners have weight 3 and all other pixels have weight -// 4. -// -// Pixels are indexed like this: -// xtl xt xtr -// xl x xr -// xbl xb xbr -// -// buf points to x -// -// fours = xl + xt + xr + xb + x -// threes = xtl + xtr + xbr + xbl -// cross_sum = 4 * fours + 3 * threes -// = 4 * (fours + threes) - threes -// = (fours + threes) << 2 - threes -static INLINE __m128i cross_sum(const int32_t *buf, int stride) { - const __m128i xtl = xx_loadu_128(buf - 1 - stride); - const __m128i xt = xx_loadu_128(buf - stride); - const __m128i xtr = xx_loadu_128(buf + 1 - stride); - const __m128i xl = xx_loadu_128(buf - 1); - const __m128i x = xx_loadu_128(buf); - const __m128i xr = xx_loadu_128(buf + 1); - const __m128i xbl = xx_loadu_128(buf - 1 + stride); - const __m128i xb = xx_loadu_128(buf + stride); - const __m128i xbr = xx_loadu_128(buf + 1 + stride); - - const __m128i fours = _mm_add_epi32( - xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x)))); - const __m128i threes = - _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); - - return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes); -} - -// The final filter for self-guided restoration. Computes a weighted average -// across A, B with "cross sums" (see cross_sum implementation above). -static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, - const int32_t *B, int buf_stride, const void *dgd8, - int dgd_stride, int width, int height, int highbd) { - const int nb = 5; - const __m128i rounding = - round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - const uint8_t *dgd_real = - highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; - - for (int i = 0; i < height; ++i) { - for (int j = 0; j < width; j += 4) { - const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride); - const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride); - const __m128i raw = - xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); - const __m128i src = - highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); - - __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); - __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), - SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); - - xx_storeu_128(dst + i * dst_stride + j, w); - } - } -} - -// Assumes that C, D are integral images for the original buffer which has been -// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels -// on the sides. A, B, C, D point at logical position (0, 0). -static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, - const int32_t *D, int width, int height, - int buf_stride, int bit_depth, int sgr_params_idx, - int radius_idx) { - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - const int r = params->r[radius_idx]; - const int n = (2 * r + 1) * (2 * r + 1); - const __m128i s = _mm_set1_epi32(params->s[radius_idx]); - // one_over_n[n-1] is 2^12/n, so easily fits in an int16 - const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]); - - const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); - const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); - - // Set up masks - const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); - __m128i mask[4]; - for (int idx = 0; idx < 4; idx++) { - const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); - mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); - } - - for (int i = -1; i < height + 1; i += 2) { - for (int j = -1; j < width + 1; j += 4) { - const int32_t *Cij = C + i * buf_stride + j; - const int32_t *Dij = D + i * buf_stride + j; - - __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); - __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); - - // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain - // some uninitialised data in their upper words. We use a mask to - // ensure that these bits are set to 0. - int idx = AOMMIN(4, width + 1 - j); - assert(idx >= 1); - - if (idx < 4) { - sum1 = _mm_and_si128(mask[idx], sum1); - sum2 = _mm_and_si128(mask[idx], sum2); - } - - const __m128i p = compute_p(sum1, sum2, bit_depth, n); - - const __m128i z = _mm_min_epi32( - _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), - SGRPROJ_MTABLE_BITS), - _mm_set1_epi32(255)); - - // 'Gather' type instructions are not available pre-AVX2, so synthesize a - // gather using scalar loads. - const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)], - x_by_xplus1[_mm_extract_epi32(z, 2)], - x_by_xplus1[_mm_extract_epi32(z, 1)], - x_by_xplus1[_mm_extract_epi32(z, 0)]); - - xx_storeu_128(A + i * buf_stride + j, a_res); - - const __m128i a_complement = - _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); - - // sum1 might have lanes greater than 2^15, so we can't use madd to do - // multiplication involving sum1. However, a_complement and one_over_n - // are both less than 256, so we can multiply them first. - const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); - const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); - const __m128i b_res = - _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); - - xx_storeu_128(B + i * buf_stride + j, b_res); - } - } -} - -// Calculate 4 values of the "cross sum" starting at buf. -// -// Pixels are indexed like this: -// xtl xt xtr -// - buf - -// xbl xb xbr -// -// Pixels are weighted like this: -// 5 6 5 -// 0 0 0 -// 5 6 5 -// -// fives = xtl + xtr + xbl + xbr -// sixes = xt + xb -// cross_sum = 6 * sixes + 5 * fives -// = 5 * (fives + sixes) - sixes -// = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { - const __m128i xtl = xx_loadu_128(buf - 1 - stride); - const __m128i xt = xx_loadu_128(buf - stride); - const __m128i xtr = xx_loadu_128(buf + 1 - stride); - const __m128i xbl = xx_loadu_128(buf - 1 + stride); - const __m128i xb = xx_loadu_128(buf + stride); - const __m128i xbr = xx_loadu_128(buf + 1 + stride); - - const __m128i fives = - _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); - const __m128i sixes = _mm_add_epi32(xt, xb); - const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); - - return _mm_add_epi32( - _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), - sixes); -} - -// Calculate 4 values of the "cross sum" starting at buf. -// -// Pixels are indexed like this: -// xl x xr -// -// Pixels are weighted like this: -// 5 6 5 -// -// buf points to x -// -// fives = xl + xr -// sixes = x -// cross_sum = 5 * fives + 6 * sixes -// = 4 * (fives + sixes) + (fives + sixes) + sixes -// = (fives + sixes) << 2 + (fives + sixes) + sixes -static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) { - const __m128i xl = xx_loadu_128(buf - 1); - const __m128i x = xx_loadu_128(buf); - const __m128i xr = xx_loadu_128(buf + 1); - - const __m128i fives = _mm_add_epi32(xl, xr); - const __m128i sixes = x; - - const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); - - return _mm_add_epi32( - _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), - sixes); -} - -// The final filter for the self-guided restoration. Computes a -// weighted average across A, B with "cross sums" (see cross_sum_... -// implementations above). -static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, - const int32_t *B, int buf_stride, - const void *dgd8, int dgd_stride, int width, - int height, int highbd) { - const int nb0 = 5; - const int nb1 = 4; - - const __m128i rounding0 = - round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); - const __m128i rounding1 = - round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); - - const uint8_t *dgd_real = - highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; - - for (int i = 0; i < height; ++i) { - if (!(i & 1)) { // even row - for (int j = 0; j < width; j += 4) { - const __m128i a = - cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); - const __m128i b = - cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); - const __m128i raw = - xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); - const __m128i src = - highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); - - __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); - __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0), - SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); - - xx_storeu_128(dst + i * dst_stride + j, w); - } - } else { // odd row - for (int j = 0; j < width; j += 4) { - const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j); - const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j); - const __m128i raw = - xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); - const __m128i src = - highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); - - __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); - __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1), - SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); - - xx_storeu_128(dst + i * dst_stride + j, w); - } - } - } -} - -int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, - int height, int dgd_stride, int32_t *flt0, - int32_t *flt1, int flt_stride, - int sgr_params_idx, int bit_depth, - int highbd) { - int32_t *buf = (int32_t *)aom_memalign( - 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); - if (!buf) return -1; - memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); - - const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; - const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; - - // Adjusting the stride of A and B here appears to avoid bad cache effects, - // leading to a significant speed improvement. - // We also align the stride to a multiple of 16 bytes for efficiency. - int buf_stride = ((width_ext + 3) & ~3) + 16; - - // The "tl" pointers point at the top-left of the initialised data for the - // array. Adding 3 here ensures that column 1 is 16-byte aligned. - int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3; - int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3; - int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3; - int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3; - - // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note - // there's a zero row and column in A, B (integral images), so we move down - // and right one for them. - const int buf_diag_border = - SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; - - int32_t *A0 = Atl + 1 + buf_stride; - int32_t *B0 = Btl + 1 + buf_stride; - int32_t *C0 = Ctl + 1 + buf_stride; - int32_t *D0 = Dtl + 1 + buf_stride; - - // Finally, A, B, C, D point at position (0, 0). - int32_t *A = A0 + buf_diag_border; - int32_t *B = B0 + buf_diag_border; - int32_t *C = C0 + buf_diag_border; - int32_t *D = D0 + buf_diag_border; - - const int dgd_diag_border = - SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; - const uint8_t *dgd0 = dgd8 - dgd_diag_border; - - // Generate integral images from the input. C will contain sums of squares; D - // will contain just sums - if (highbd) - integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, - height_ext, Ctl, Dtl, buf_stride); - else - integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, - buf_stride); - - const sgr_params_type *const params = &sgr_params[sgr_params_idx]; - // Write to flt0 and flt1 - // If params->r == 0 we skip the corresponding filter. We only allow one of - // the radii to be 0, as having both equal to 0 would be equivalent to - // skipping SGR entirely. - assert(!(params->r[0] == 0 && params->r[1] == 0)); - assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); - assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); - - if (params->r[0] > 0) { - calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, - sgr_params_idx, 0); - final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, - width, height, highbd); - } - - if (params->r[1] > 0) { - calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, - 1); - final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, - height, highbd); - } - aom_free(buf); - return 0; -} - -void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, - int height, int stride, int eps, - const int *xqd, uint8_t *dst8, - int dst_stride, int32_t *tmpbuf, - int bit_depth, int highbd) { - int32_t *flt0 = tmpbuf; - int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; - assert(width * height <= RESTORATION_UNITPELS_MAX); - const int ret = av1_selfguided_restoration_sse4_1( - dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); - (void)ret; - assert(!ret); - const sgr_params_type *const params = &sgr_params[eps]; - int xq[2]; - decode_xq(xqd, xq, params); - - __m128i xq0 = _mm_set1_epi32(xq[0]); - __m128i xq1 = _mm_set1_epi32(xq[1]); - - for (int i = 0; i < height; ++i) { - // Calculate output in batches of 8 pixels - for (int j = 0; j < width; j += 8) { - const int k = i * width + j; - const int m = i * dst_stride + j; - - const uint8_t *dat8ij = dat8 + i * stride + j; - __m128i src; - if (highbd) { - src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); - } else { - src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij)); - } - - const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS); - const __m128i u_0 = _mm_cvtepu16_epi32(u); - const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8)); - - __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS); - __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS); - - if (params->r[0] > 0) { - const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0); - v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0)); - - const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1); - v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1)); - } - - if (params->r[1] > 0) { - const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0); - v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0)); - - const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1); - v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1)); - } - - const __m128i rounding = - round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), - SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), - SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); - - if (highbd) { - // Pack into 16 bits and clamp to [0, 2^bit_depth) - const __m128i tmp = _mm_packus_epi32(w_0, w_1); - const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1); - const __m128i res = _mm_min_epi16(tmp, max); - xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res); - } else { - // Pack into 8 bits and clamp to [0, 256) - const __m128i tmp = _mm_packs_epi32(w_0, w_1); - const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); - xx_storel_64(dst8 + m, res); - } - } - } -} diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c deleted file mode 100644 index b810cea2e..000000000 --- a/third_party/aom/av1/common/x86/warp_plane_sse4.c +++ /dev/null @@ -1,942 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> -#include <smmintrin.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/warped_motion.h" - -/* This is a modified version of 'warped_filter' from warped_motion.c: - * Each coefficient is stored in 8 bits instead of 16 bits - * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 - - This is done in order to avoid overflow: Since the tap with the largest - coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation - order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular - convolve functions. - - Instead, we use the summation order - ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). - The rearrangement of coefficients in this table is so that we can get the - coefficients into the correct order more quickly. -*/ -/* clang-format off */ -DECLARE_ALIGNED(8, static const int8_t, - filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { -#if WARPEDPIXEL_PREC_BITS == 6 - // [-1, 0) - { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, - { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, - { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, - { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, - { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, - { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, - { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, - { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, - { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, - { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, - { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, - { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, - { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, - { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, - { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, - { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, - { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, - { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, - { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, - { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, - { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, - { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, - { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, - { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, - { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, - { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, - { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, - { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, - { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, - { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, - { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, - { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, - // [0, 1) - { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, - { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, - { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, - {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, - {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, - {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, - {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, - {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, - {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, - {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, - {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, - {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, - {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, - {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, - {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, - {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, - {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, - {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, - {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, - {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, - {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, - {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, - {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, - {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, - {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, - {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, - {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, - {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, - {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, - {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, - { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, - { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, - // [1, 2) - { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, - { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, - { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, - { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, - { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, - { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, - { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, - { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, - { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, - { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, - { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, - { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, - { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, - { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, - { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, - { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, - { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, - { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, - { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, - { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, - { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, - { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, - { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, - { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, - { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, - { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, - { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, - { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, - { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, - { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, - { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, - { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, - // dummy (replicate row index 191) - { 0, 0, 2, -1, 0, 0, 127, 0}, - -#else - // [-1, 0) - { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, - { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, - { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, - { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, - { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, - { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, - { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, - { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, - { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, - { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, - { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, - { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, - { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, - { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, - { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, - { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, - // [0, 1) - { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, - { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, - {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, - {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, - {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, - {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, - {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, - {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, - {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, - {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, - {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, - {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, - {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, - {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, - {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, - { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, - // [1, 2) - { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, - { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, - { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, - { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, - { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, - { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, - { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, - { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, - { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, - { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, - { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, - { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, - { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, - { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, - { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, - { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, - // dummy (replicate row index 95) - { 0, 0, 4, -3, 0, -1, 127, 1}, -#endif // WARPEDPIXEL_PREC_BITS == 6 -}; -/* clang-format on */ - -// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 -// in an SSE register into two sequences: -// 0, 2, 2, 4, ..., 12, 12, 14, <don't care> -// 1, 3, 3, 5, ..., 13, 13, 15, <don't care> -static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8, - 8, 10, 10, 12, 12, 14, 14, 0 }; -static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9, - 9, 11, 11, 13, 13, 15, 15, 0 }; - -static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1 }; - -static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3, - 2, 3, 2, 3, 2, 3, 2, 3 }; - -static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5, - 4, 5, 4, 5, 4, 5, 4, 5 }; - -static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7, - 6, 7, 6, 7, 6, 7, 6, 7 }; - -static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3 }; -static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, - 4, 5, 6, 7, 4, 5, 6, 7 }; -static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11, - 8, 9, 10, 11, 8, 9, 10, 11 }; -static const uint8_t shuffle_gamma0_mask3[16] = { - 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 -}; - -static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, - const int offset_bits_horiz, - const int reduce_bits_horiz, int k) { - const __m128i src_even = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask)); - const __m128i src_odd = - _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask)); - // The pixel order we need for 'src' is: - // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 - const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); - const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); - // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 - const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), - _mm_srli_si128(src_odd, 4)); - const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); - // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 - const __m128i src_13 = - _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); - const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); - // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 - const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), - _mm_srli_si128(src_even, 6)); - const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); - - const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + - ((1 << reduce_bits_horiz) >> 1)); - - // Note: The values res_02 + res_46 and res_13 + res_57 both - // fit into int16s at this point, but their sum may be too wide to fit - // into an int16. However, once we also add round_const, the sum of - // all of these fits into a uint16. - // - // The wrapping behaviour of _mm_add_* is used here to make sure we - // get the correct result despite converting between different - // (implicit) types. - const __m128i res_even = _mm_add_epi16(res_02, res_46); - const __m128i res_odd = _mm_add_epi16(res_13, res_57); - const __m128i res = - _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); - tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); -} - -static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, - __m128i *coeff) { - // Filter even-index pixels - const __m128i tmp_0 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_1 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_2 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_3 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_4 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_5 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_6 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); - const __m128i tmp_7 = _mm_loadl_epi64( - (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); - - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 - const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 - const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 - const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); - // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 - const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); - - // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 - const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); - // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); - // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 - const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); - // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); - - // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 - coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); - // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 - coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); - // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 - coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); - // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 - coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); -} - -static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, - __m128i *coeff) { - // Filter even-index pixels - const __m128i tmp_0 = - _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); - - // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 - coeff[0] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01)); - // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 - coeff[1] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23)); - // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 - coeff[2] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45)); - // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 - coeff[3] = _mm_shuffle_epi8( - tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67)); -} - -static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, - int alpha, int k, - const int offset_bits_horiz, - const int reduce_bits_horiz) { - __m128i coeff[4]; - prepare_horizontal_filter_coeff(alpha, sx, coeff); - filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); -} - -static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, - int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, - int p_height, int height, int i, - const int offset_bits_horiz, - const int reduce_bits_horiz) { - int k; - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, - reduce_bits_horiz); - } -} - -static INLINE void warp_horizontal_filter_alpha0( - const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - (void)alpha; - int k; - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - - __m128i coeff[4]; - prepare_horizontal_filter_coeff_alpha0(sx, coeff); - filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); - } -} - -static INLINE void warp_horizontal_filter_beta0( - const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - (void)beta; - int k; - __m128i coeff[4]; - prepare_horizontal_filter_coeff(alpha, sx4, coeff); - - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); - } -} - -static INLINE void warp_horizontal_filter_alpha0_beta0( - const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - (void)beta; - (void)alpha; - int k; - - __m128i coeff[4]; - prepare_horizontal_filter_coeff_alpha0(sx4, coeff); - - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - - // Load source pixels - const __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); - } -} - -static INLINE void unpack_weights_and_set_round_const( - ConvolveParams *conv_params, const int round_bits, const int offset_bits, - __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { - *res_sub_const = - _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - - (1 << (offset_bits - conv_params->round_1 - 1))); - *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - *wt = _mm_unpacklo_epi16(wt0, wt1); -} - -static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, - __m128i *coeffs) { - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_2 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_4 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_6 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); - const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); - const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); - const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); - - // even coeffs - coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); - coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); - coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); - coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); - - const __m128i tmp_1 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_3 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_5 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); - const __m128i tmp_7 = _mm_loadu_si128( - (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); - - const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); - const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); - const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); - const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); - - // odd coeffs - coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); - coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); - coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); - coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); -} - -static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, - __m128i *coeffs) { - const __m128i tmp_0 = _mm_loadu_si128( - (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); - - // even coeffs - coeffs[0] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0)); - coeffs[1] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1)); - coeffs[2] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2)); - coeffs[3] = - _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3)); - - // odd coeffs - coeffs[4] = coeffs[0]; - coeffs[5] = coeffs[1]; - coeffs[6] = coeffs[2]; - coeffs[7] = coeffs[3]; -} - -static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, - __m128i *res_lo, __m128i *res_hi, - int k) { - // Load from tmp and rearrange pairs of consecutive rows into the - // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 - const __m128i *src = tmp + (k + 4); - const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); - const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); - const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); - const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); - const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); - const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); - const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); - - const __m128i res_even = - _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); - const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); - const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); - const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); - const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); - const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); - const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); - - const __m128i res_odd = - _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - *res_lo = _mm_unpacklo_epi32(res_even, res_odd); - *res_hi = _mm_unpackhi_epi32(res_even, res_odd); -} - -static INLINE void store_vertical_filter_output( - __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, - const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, - uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, - const int reduce_bits_vert, int p_stride, int p_width, - const int round_bits) { - __m128i res_lo_1 = *res_lo; - __m128i res_hi_1 = *res_hi; - - if (conv_params->is_compound) { - __m128i *const p = - (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; - res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), - reduce_bits_vert); - const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); - __m128i res_lo_16; - if (conv_params->do_average) { - __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - const __m128i p_16 = _mm_loadl_epi64(p); - - if (conv_params->use_jnt_comp_avg) { - const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); - const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); - const __m128i shifted_32 = - _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); - res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); - } else { - res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); - } - - res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); - - res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), - round_bits); - __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); - *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); - } else { - _mm_storel_epi64(p, temp_lo_16); - } - if (p_width > 4) { - __m128i *const p4 = - (__m128i *)&conv_params - ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; - res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), - reduce_bits_vert); - const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); - __m128i res_hi_16; - - if (conv_params->do_average) { - __m128i *const dst8_4 = - (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; - const __m128i p4_16 = _mm_loadl_epi64(p4); - - if (conv_params->use_jnt_comp_avg) { - const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); - const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); - const __m128i shifted_32 = - _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); - res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); - } else { - res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); - } - res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); - - res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), - round_bits); - __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); - *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); - - } else { - _mm_storel_epi64(p4, temp_hi_16); - } - } - } else { - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - // Store, blending with 'pred' if needed - __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; - - // Note: If we're outputting a 4x4 block, we need to be very careful - // to only output 4 pixels at this point, to avoid encode/decode - // mismatches when encoding with multiple threads. - if (p_width == 4) { - *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); - } else { - _mm_storel_epi64(p, res_8bit); - } - } -} - -static INLINE void warp_vertical_filter( - uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, - int16_t delta, int p_height, int p_stride, int p_width, int i, int j, - int sy4, const int reduce_bits_vert, const __m128i *res_add_const, - const int round_bits, const int offset_bits) { - int k; - __m128i res_sub_const, round_bits_const, wt; - unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, - &res_sub_const, &round_bits_const, &wt); - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - __m128i coeffs[8]; - prepare_vertical_filter_coeffs(gamma, sy, coeffs); - - __m128i res_lo; - __m128i res_hi; - filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); - - store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, - &res_sub_const, &round_bits_const, pred, - conv_params, i, j, k, reduce_bits_vert, - p_stride, p_width, round_bits); - } -} - -static INLINE void warp_vertical_filter_gamma0( - uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, - int16_t delta, int p_height, int p_stride, int p_width, int i, int j, - int sy4, const int reduce_bits_vert, const __m128i *res_add_const, - const int round_bits, const int offset_bits) { - int k; - (void)gamma; - __m128i res_sub_const, round_bits_const, wt; - unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, - &res_sub_const, &round_bits_const, &wt); - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - int sy = sy4 + delta * (k + 4); - - __m128i coeffs[8]; - prepare_vertical_filter_coeffs_gamma0(sy, coeffs); - - __m128i res_lo; - __m128i res_hi; - filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); - - store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, - &res_sub_const, &round_bits_const, pred, - conv_params, i, j, k, reduce_bits_vert, - p_stride, p_width, round_bits); - } -} - -static INLINE void warp_vertical_filter_delta0( - uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, - int16_t delta, int p_height, int p_stride, int p_width, int i, int j, - int sy4, const int reduce_bits_vert, const __m128i *res_add_const, - const int round_bits, const int offset_bits) { - (void)delta; - int k; - __m128i res_sub_const, round_bits_const, wt; - unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, - &res_sub_const, &round_bits_const, &wt); - - __m128i coeffs[8]; - prepare_vertical_filter_coeffs(gamma, sy4, coeffs); - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - __m128i res_lo; - __m128i res_hi; - filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); - - store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, - &res_sub_const, &round_bits_const, pred, - conv_params, i, j, k, reduce_bits_vert, - p_stride, p_width, round_bits); - } -} - -static INLINE void warp_vertical_filter_gamma0_delta0( - uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, - int16_t delta, int p_height, int p_stride, int p_width, int i, int j, - int sy4, const int reduce_bits_vert, const __m128i *res_add_const, - const int round_bits, const int offset_bits) { - (void)delta; - (void)gamma; - int k; - __m128i res_sub_const, round_bits_const, wt; - unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, - &res_sub_const, &round_bits_const, &wt); - - __m128i coeffs[8]; - prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); - // Vertical filter - for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { - __m128i res_lo; - __m128i res_hi; - filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); - - store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, - &res_sub_const, &round_bits_const, pred, - conv_params, i, j, k, reduce_bits_vert, - p_stride, p_width, round_bits); - } -} - -static INLINE void prepare_warp_vertical_filter( - uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, - int16_t delta, int p_height, int p_stride, int p_width, int i, int j, - int sy4, const int reduce_bits_vert, const __m128i *res_add_const, - const int round_bits, const int offset_bits) { - if (gamma == 0 && delta == 0) - warp_vertical_filter_gamma0_delta0( - pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, - sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); - else if (gamma == 0 && delta != 0) - warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, - p_stride, p_width, i, j, sy4, reduce_bits_vert, - res_add_const, round_bits, offset_bits); - else if (gamma != 0 && delta == 0) - warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, - p_stride, p_width, i, j, sy4, reduce_bits_vert, - res_add_const, round_bits, offset_bits); - else - warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, - p_stride, p_width, i, j, sy4, reduce_bits_vert, - res_add_const, round_bits, offset_bits); -} - -static INLINE void prepare_warp_horizontal_filter( - const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, - int32_t sx4, int alpha, int beta, int p_height, int height, int i, - const int offset_bits_horiz, const int reduce_bits_horiz) { - if (alpha == 0 && beta == 0) - warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, - beta, p_height, height, i, - offset_bits_horiz, reduce_bits_horiz); - else if (alpha == 0 && beta != 0) - warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, - p_height, height, i, offset_bits_horiz, - reduce_bits_horiz); - else if (alpha != 0 && beta == 0) - warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, - p_height, height, i, offset_bits_horiz, - reduce_bits_horiz); - else - warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, - p_height, height, i, offset_bits_horiz, - reduce_bits_horiz); -} - -void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, - int height, int stride, uint8_t *pred, int p_col, - int p_row, int p_width, int p_height, int p_stride, - int subsampling_x, int subsampling_y, - ConvolveParams *conv_params, int16_t alpha, - int16_t beta, int16_t gamma, int16_t delta) { - __m128i tmp[15]; - int i, j, k; - const int bd = 8; - const int reduce_bits_horiz = conv_params->round_0; - const int reduce_bits_vert = conv_params->is_compound - ? conv_params->round_1 - : 2 * FILTER_BITS - reduce_bits_horiz; - const int offset_bits_horiz = bd + FILTER_BITS - 1; - assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); - - const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; - const __m128i reduce_bits_vert_const = - _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); - const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); - const int round_bits = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); - - /* Note: For this code to work, the left/right frame borders need to be - extended by at least 13 pixels each. By the time we get here, other - code will have set up this border, but we allow an explicit check - for debugging purposes. - */ - /*for (i = 0; i < height; ++i) { - for (j = 0; j < 13; ++j) { - assert(ref[i * stride - 13 + j] == ref[i * stride]); - assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); - } - }*/ - __m128i res_add_const_1; - if (conv_params->is_compound == 1) { - res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); - } else { - res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + - ((1 << reduce_bits_vert) >> 1)); - } - - for (i = 0; i < p_height; i += 8) { - for (j = 0; j < p_width; j += 8) { - const int32_t src_x = (p_col + j + 4) << subsampling_x; - const int32_t src_y = (p_row + i + 4) << subsampling_y; - const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; - const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; - const int32_t x4 = dst_x >> subsampling_x; - const int32_t y4 = dst_y >> subsampling_y; - - int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; - int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - - // Add in all the constant terms, including rounding and offset - sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + - (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); - - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - - // Horizontal filter - // If the block is aligned such that, after clamping, every sample - // would be taken from the leftmost/rightmost column, then we can - // skip the expensive horizontal filter. - if (ix4 <= -7) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = _mm_set1_epi16( - (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); - } - } else if (ix4 >= width + 6) { - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - tmp[k + 7] = - _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + - ref[iy * stride + (width - 1)] * - (1 << (FILTER_BITS - reduce_bits_horiz))); - } - } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { - const int out_of_boundary_left = -(ix4 - 6); - const int out_of_boundary_right = (ix4 + 8) - width; - for (k = -7; k < AOMMIN(8, p_height - i); ++k) { - int iy = iy4 + k; - if (iy < 0) - iy = 0; - else if (iy > height - 1) - iy = height - 1; - int sx = sx4 + beta * (k + 4); - - // Load source pixels - __m128i src = - _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); - if (out_of_boundary_left >= 0) { - const __m128i shuffle_reg_left = - _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); - src = _mm_shuffle_epi8(src, shuffle_reg_left); - } - if (out_of_boundary_right >= 0) { - const __m128i shuffle_reg_right = _mm_loadu_si128( - (__m128i *)warp_pad_right[out_of_boundary_right]); - src = _mm_shuffle_epi8(src, shuffle_reg_right); - } - horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, - reduce_bits_horiz); - } - } else { - prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, - beta, p_height, height, i, - offset_bits_horiz, reduce_bits_horiz); - } - - // Vertical filter - prepare_warp_vertical_filter( - pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, - j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); - } - } -} diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c deleted file mode 100644 index 87a6e1239..000000000 --- a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <immintrin.h> -#include <assert.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86/synonyms_avx2.h" - -// 128-bit xmmwords are written as [ ... ] with the MSB on the left. -// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB -// on the left. -// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be -// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. -void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, - const ConvolveParams *conv_params) { - const int bd = 8; - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - (void)x_step_q4; - (void)y_step_q4; - - DECLARE_ALIGNED(32, uint16_t, - temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 2; - memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero_128 = _mm_setzero_si128(); - const __m256i zero_256 = _mm256_setzero_si256(); - - // Add an offset to account for the "add_src" part of the convolve function. - const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); - - const __m256i clamp_low = zero_256; - const __m256i clamp_high = - _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); - - /* Horizontal filter */ - { - // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] - const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); - - // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] - const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = _mm256_set1_epi32( - (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); - - for (int i = 0; i < intermediate_height; ++i) { - for (int j = 0; j < w; j += 16) { - const uint8_t *data_ij = src_ptr + i * src_stride + j; - - // Load 8-bit src data - const __m128i data_0 = xx_loadu_128(data_ij + 0); - const __m128i data_1 = xx_loadu_128(data_ij + 1); - const __m128i data_2 = xx_loadu_128(data_ij + 2); - const __m128i data_3 = xx_loadu_128(data_ij + 3); - const __m128i data_4 = xx_loadu_128(data_ij + 4); - const __m128i data_5 = xx_loadu_128(data_ij + 5); - const __m128i data_6 = xx_loadu_128(data_ij + 6); - const __m128i data_7 = xx_loadu_128(data_ij + 7); - - // (Zero-)Extend 8-bit data to 16-bit data - const __m256i src_0 = _mm256_cvtepu8_epi16(data_0); - const __m256i src_1 = _mm256_cvtepu8_epi16(data_1); - const __m256i src_2 = _mm256_cvtepu8_epi16(data_2); - const __m256i src_3 = _mm256_cvtepu8_epi16(data_3); - const __m256i src_4 = _mm256_cvtepu8_epi16(data_4); - const __m256i src_5 = _mm256_cvtepu8_epi16(data_5); - const __m256i src_6 = _mm256_cvtepu8_epi16(data_6); - const __m256i src_7 = _mm256_cvtepu8_epi16(data_7); - - // Multiply src data by filter coeffs and sum pairs - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - // Calculate scalar product for even- and odd-indices separately, - // increasing to 32-bit precision - const __m256i res_even_sum = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); - const __m256i res_odd_sum = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); - - const __m256i res_even = _mm256_srai_epi32( - _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); - const __m256i res_odd = _mm256_srai_epi32( - _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); - - // Reduce to 16-bit precision and pack even- and odd-index results - // back into one register. The _mm256_packs_epi32 intrinsic returns - // a register with the pixels ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i res = _mm256_packs_epi32(res_even, res_odd); - const __m256i res_clamped = - _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); - - // Store in a temporary array - yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); - } - } - } - - /* Vertical filter */ - { - // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ] - const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); - - // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ] - const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ] - const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ] - const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); - // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ] - const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); - // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ] - const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); - // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ] - const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); - - // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ] - const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); - // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ] - const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); - // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ] - const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); - // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ] - const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); - - const __m256i round_const = - _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - - (1 << (bd + conv_params->round_1 - 1))); - - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 16) { - const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j; - - // Load 16-bit data from the output of the horizontal filter in - // which the pixels are ordered as follows: - // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] - const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE); - const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE); - const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE); - const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE); - const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE); - const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE); - const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE); - const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE); - - // Filter the even-indices, increasing to 32-bit precision - const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); - const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); - const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); - const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); - - const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); - const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); - const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); - const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); - - const __m256i res_even = _mm256_add_epi32( - _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); - - // Filter the odd-indices, increasing to 32-bit precision - const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); - const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); - const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); - const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); - - const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); - const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); - const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); - const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); - - const __m256i res_odd = _mm256_add_epi32( - _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); - - // Pixels are currently in the following order: - // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] - // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] - // - // Rearrange the pixels into the following order: - // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] - // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] - const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); - const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); - - const __m256i res_lo_round = _mm256_srai_epi32( - _mm256_add_epi32(res_lo, round_const), conv_params->round_1); - const __m256i res_hi_round = _mm256_srai_epi32( - _mm256_add_epi32(res_hi, round_const), conv_params->round_1); - - // Reduce to 16-bit precision and pack into the correct order: - // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] - const __m256i res_16bit = - _mm256_packs_epi32(res_lo_round, res_hi_round); - - // Reduce to 8-bit precision. This messes up the order: - // [ - - - - - - - - 15 14 13 12 11 10 9 8 ] - // [ - - - - - - - - 7 6 5 4 3 2 1 0 ] - const __m256i res_8bit = - _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */); - - // Swap the two central 32-bit values to get the order: - // [ - - - - - - - - - - - - - - - - ] - // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ] - const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8); - - // Store the lower 128-bit lane in the dst array - xx_storeu_128(dst + i * dst_stride + j, - _mm256_castsi256_si128(res_8bit2)); - } - } - } -} diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c deleted file mode 100644 index f9d00b733..000000000 --- a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include <emmintrin.h> -#include <assert.h> - -#include "config/av1_rtcd.h" - -#include "av1/common/convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" - -void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, - const ConvolveParams *conv_params) { - const int bd = 8; - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - (void)x_step_q4; - (void)y_step_q4; - - DECLARE_ALIGNED(16, uint16_t, - temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); - int intermediate_height = h + SUBPEL_TAPS - 2; - memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); - int i, j; - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero = _mm_setzero_si128(); - // Add an offset to account for the "add_src" part of the convolve function. - const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); - - /* Horizontal filter */ - { - const __m128i coeffs_x = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); - - for (i = 0; i < intermediate_height; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - // Filter even-index pixels - const __m128i src_0 = _mm_unpacklo_epi8(data, zero); - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - conv_params->round_0); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - conv_params->round_0); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_min_epi16( - _mm_max_epi16(res, zero), - _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1)); - _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); - } - } - } - - /* Vertical filter */ - { - const __m128i coeffs_y = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - - (1 << (bd + conv_params->round_1 - 1))); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), conv_params->round_1); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), conv_params->round_1); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storel_epi64(p, res_8bit); - } - } - } -} |