summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1/common/x86
diff options
context:
space:
mode:
authortrav90 <travawine@palemoon.org>2018-10-19 21:52:15 -0500
committertrav90 <travawine@palemoon.org>2018-10-19 21:52:20 -0500
commitbbcc64772580c8a979288791afa02d30bc476d2e (patch)
tree437ce94c3fdd7497508e5b55de06c6d011678597 /third_party/aom/av1/common/x86
parent14805f6ddbfb173c327768fff9f81f40ce5e81b0 (diff)
downloadUXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.gz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.lz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.tar.xz
UXP-bbcc64772580c8a979288791afa02d30bc476d2e.zip
Update aom to v1.0.0
Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0.
Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c228
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c665
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_ssse3.c1034
-rw-r--r--third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c839
-rw-r--r--third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c82
-rw-r--r--third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c334
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c1957
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h210
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c2917
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h236
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm1d_sse4.h144
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse2.h317
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.c10
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.h60
-rw-r--r--third_party/aom/av1/common/x86/cfl_avx2.c491
-rw-r--r--third_party/aom/av1/common/x86/cfl_simd.h238
-rw-r--r--third_party/aom/av1/common/x86/cfl_sse2.c89
-rw-r--r--third_party/aom/av1/common/x86/cfl_ssse3.c393
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_avx2.c285
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_sse2.c496
-rw-r--r--third_party/aom/av1/common/x86/convolve_avx2.c581
-rw-r--r--third_party/aom/av1/common/x86/convolve_sse2.c339
-rw-r--r--third_party/aom/av1/common/x86/filterintra_sse4.c931
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c327
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c191
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c421
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c528
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c339
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c2179
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c853
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c383
-rw-r--r--third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h10
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c454
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c365
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c245
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c202
-rw-r--r--third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c450
-rw-r--r--third_party/aom/av1/common/x86/idct_intrin_sse2.c1411
-rw-r--r--third_party/aom/av1/common/x86/intra_edge_sse4.c12
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_avx2.c704
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_sse2.c385
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_ssse3.c232
-rw-r--r--third_party/aom/av1/common/x86/pvq_sse4.c252
-rw-r--r--third_party/aom/av1/common/x86/pvq_sse4.h13
-rw-r--r--third_party/aom/av1/common/x86/reconinter_avx2.c124
-rw-r--r--third_party/aom/av1/common/x86/reconinter_sse4.c153
-rw-r--r--third_party/aom/av1/common/x86/reconinter_ssse3.c116
-rw-r--r--third_party/aom/av1/common/x86/selfguided_avx2.c719
-rw-r--r--third_party/aom/av1/common/x86/selfguided_sse4.c2254
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_sse2.c359
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_sse4.c (renamed from third_party/aom/av1/common/x86/warp_plane_ssse3.c)384
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_avx2.c260
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_sse2.c198
53 files changed, 17308 insertions, 10091 deletions
diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 000000000..8aa14696f
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const uint8_t *src_y;
+ uint8_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint8_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 8-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_8 = xx_loadl_64(src_x0);
+ const __m128i src1_8 = xx_loadl_64(src_x1);
+ const __m128i src2_8 = xx_loadl_64(src_x2);
+ const __m128i src3_8 = xx_loadl_64(src_x3);
+
+ // Now zero-extend up to 16-bit precision, i.e.
+ // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+ const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+ const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+ const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+ const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Pack 16-bit values into 8-bit values, i.e.
+ // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
+ // -> [ 0 0 0 0 0 0 DC BA ]
+ const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
+
+ // Write to the output
+ xx_storel_32(&dst_y[x], shifted_8);
+ }
+ }
+}
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ int x0_qn, int x_step_qn, int bd) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
+
+ const uint16_t *src_y;
+ uint16_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint16_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 16-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_16 = xx_loadu_128(src_x0);
+ const __m128i src1_16 = xx_loadu_128(src_x1);
+ const __m128i src2_16 = xx_loadu_128(src_x2);
+ const __m128i src3_16 = xx_loadu_128(src_x3);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Clip the values at (1 << bd) - 1
+ const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+ // Write to the output
+ xx_storel_64(&dst_y[x], clipped_16);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
index 1f0fedb2a..6747cae01 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -12,135 +12,16 @@
#include <assert.h>
#include <smmintrin.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-// Make a mask for coefficients of 10/12 tap filters. The coefficients are
-// packed "89ab89ab". If it's a 12-tap filter, we want all 1's; if it's a
-// 10-tap filter, we want "11001100" to just match the 8,9 terms.
-static __m128i make_1012_mask(int ntaps) {
- uint32_t low = 0xffffffff;
- uint32_t high = (ntaps == 12) ? low : 0;
- return _mm_set_epi32(high, low, high, low);
-}
-
-// Zero-extend the given input operand to an entire __m128i register.
-//
-// Note that there's almost an intrinsic to do this but 32-bit Visual Studio
-// doesn't have _mm_set_epi64x so we have to do it by hand.
-static __m128i extend_32_to_128(uint32_t x) {
- return _mm_set_epi32(0, 0, 0, x);
-}
-
-// Load an SSE register from p and bitwise AND with a.
-static __m128i load_and_128i(const void *p, __m128i a) {
- const __m128d ad = _mm_castsi128_pd(a);
- const __m128d bd = _mm_load1_pd((const double *)p);
- return _mm_castpd_si128(_mm_and_pd(ad, bd));
-}
-
-// The horizontal filter for av1_convolve_2d_scale_sse4_1. This is the more
-// general version, supporting 10 and 12 tap filters. For 8-tap filters, use
-// hfilter8.
-static void hfilter(const uint8_t *src, int src_stride, int32_t *dst, int w,
- int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params, unsigned round) {
- const int bd = 8;
- const int ntaps = filter_params->taps;
- assert(ntaps == 10 || ntaps == 12);
-
- src -= ntaps / 2 - 1;
-
- // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero
- // out the unneeded entries.
- const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
- int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
-
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the
- // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they
- // are masked out with hicoeff_mask.
- const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
- const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask);
- const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
-
- int y;
- for (y = 0; y <= h - 4; y += 4) {
- const uint8_t *const src0 = src_col + y * src_stride;
- const uint8_t *const src1 = src0 + 1 * src_stride;
- const uint8_t *const src2 = src0 + 2 * src_stride;
- const uint8_t *const src3 = src0 + 3 * src_stride;
-
- // Load up source data. This is 8-bit input data, so each load gets 16
- // pixels (we need at most 12)
- const __m128i data08 = _mm_loadu_si128((__m128i *)src0);
- const __m128i data18 = _mm_loadu_si128((__m128i *)src1);
- const __m128i data28 = _mm_loadu_si128((__m128i *)src2);
- const __m128i data38 = _mm_loadu_si128((__m128i *)src3);
-
- // Now zero-extend up to 16-bit precision by interleaving with zeros. For
- // the "high" pixels (8 to 11), interleave first (so that the expansion
- // to 16-bits operates on an entire register).
- const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
- const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
- const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
- const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
- const __m128i data01hi8 = _mm_unpackhi_epi32(data08, data18);
- const __m128i data23hi8 = _mm_unpackhi_epi32(data28, data38);
- const __m128i data01hi = _mm_unpacklo_epi8(data01hi8, zero);
- const __m128i data23hi = _mm_unpacklo_epi8(data23hi8, zero);
-
- // Multiply by coefficients
- const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
- const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
- const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
- const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
- const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi);
- const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi);
-
- // Reduce horizontally and add
- const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
- const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
- const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo);
- const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi);
- const __m128i conv = _mm_add_epi32(convlo, convhi);
-
- // Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
-
- // Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
- }
- for (; y < h; ++y) {
- const uint8_t *const src_row = src_col + y * src_stride;
-
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < ntaps; ++k) {
- sum += filter[k] * src_row[k];
- }
-
- dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
- }
- }
-}
-
// A specialised version of hfilter, the horizontal filter for
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
int h, int subpel_x_qn, int x_step_qn,
const InterpFilterParams *filter_params, unsigned round) {
const int bd = 8;
@@ -150,7 +31,7 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
int x_qn = subpel_x_qn;
for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
@@ -197,11 +78,12 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
// Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
+ __m128i shifted =
_mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+ shifted = _mm_packus_epi32(shifted, shifted);
// Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
}
for (; y < h; ++y) {
const uint8_t *const src_row = src_col + y * src_stride;
@@ -216,256 +98,179 @@ static void hfilter8(const uint8_t *src, int src_stride, int32_t *dst, int w,
}
}
-// Do a 12-tap convolution with the given coefficients, loading data from src.
-static __m128i convolve_32(const int32_t *src, __m128i coeff03, __m128i coeff47,
- __m128i coeff8d) {
- const __m128i data03 = _mm_loadu_si128((__m128i *)src);
- const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4));
- const __m128i data8d = _mm_loadu_si128((__m128i *)(src + 8));
- const __m128i conv03 = _mm_mullo_epi32(data03, coeff03);
- const __m128i conv47 = _mm_mullo_epi32(data47, coeff47);
- const __m128i conv8d = _mm_mullo_epi32(data8d, coeff8d);
- return _mm_add_epi32(_mm_add_epi32(conv03, conv47), conv8d);
-}
-
-// Do an 8-tap convolution with the given coefficients, loading data from src.
-static __m128i convolve_32_8(const int32_t *src, __m128i coeff03,
- __m128i coeff47) {
- const __m128i data03 = _mm_loadu_si128((__m128i *)src);
- const __m128i data47 = _mm_loadu_si128((__m128i *)(src + 4));
- const __m128i conv03 = _mm_mullo_epi32(data03, coeff03);
- const __m128i conv47 = _mm_mullo_epi32(data47, coeff47);
- return _mm_add_epi32(conv03, conv47);
-}
-
-// The vertical filter for av1_convolve_2d_scale_sse4_1. This is the more
-// general version, supporting 10 and 12 tap filters. For 8-tap filters, use
-// vfilter8.
-static void vfilter(const int32_t *src, int src_stride, int32_t *dst,
- int dst_stride, int w, int h, int subpel_y_qn,
- int y_step_qn, const InterpFilterParams *filter_params,
- const ConvolveParams *conv_params, int bd) {
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int ntaps = filter_params->taps;
-
- // Construct a mask with which we'll AND filter coefficients 89ab to zero out
- // the unneeded entries. The upper bits of this mask are unused.
- const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
- int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits);
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(conv_params->round_1);
-
- const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1)));
- const __m128i sub = _mm_set1_epi32(sub32);
-
- int y_qn = subpel_y_qn;
- for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
- const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(filter_idx < SUBPEL_SHIFTS);
- const int16_t *filter =
- av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
-
- // Load up coefficients for the filter and sign-extend to 32-bit precision
- // (to do so, calculate sign bits and then interleave)
- const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
- const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
- const __m128i coeffhi16 = load_and_128i(filter + 8, hicoeff_mask);
- const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero);
- const __m128i csignhi16 = _mm_cmplt_epi16(coeffhi16, zero);
- const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716);
- const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716);
- const __m128i coeff8d = _mm_unpacklo_epi16(coeffhi16, csignhi16);
-
- int x;
- for (x = 0; x <= w - 4; x += 4) {
- const int32_t *const src0 = src_y + x * src_stride;
- const int32_t *const src1 = src0 + 1 * src_stride;
- const int32_t *const src2 = src0 + 2 * src_stride;
- const int32_t *const src3 = src0 + 3 * src_stride;
-
- // Load the source data for the three rows, adding the three registers of
- // convolved products to one as we go (conv0..conv3) to avoid the
- // register pressure getting too high.
- const __m128i conv0 = convolve_32(src0, coeff03, coeff47, coeff8d);
- const __m128i conv1 = convolve_32(src1, coeff03, coeff47, coeff8d);
- const __m128i conv2 = convolve_32(src2, coeff03, coeff47, coeff8d);
- const __m128i conv3 = convolve_32(src3, coeff03, coeff47, coeff8d);
-
- // Now reduce horizontally to get one lane for each result
- const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
- const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
- const __m128i conv = _mm_hadd_epi32(conv01, conv23);
-
- // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
- const __m128i subbed = _mm_sub_epi32(shifted, sub);
-
- int32_t *dst_x = dst + y * dst_stride + x;
- const __m128i result =
- (conv_params->do_average)
- ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
- : subbed;
-
- _mm_storeu_si128((__m128i *)dst_x, result);
- }
- for (; x < w; ++x) {
- const int32_t *src_x = src_y + x * src_stride;
- CONV_BUF_TYPE sum = 1 << offset_bits;
- for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
- }
- }
+static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
+ __m128i data = _mm_loadu_si128((__m128i *)src);
+ return _mm_madd_epi16(data, coeff);
}
// A specialised version of vfilter, the vertical filter for
// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
-static void vfilter8(const int32_t *src, int src_stride, int32_t *dst,
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h, int subpel_y_qn,
int y_step_qn, const InterpFilterParams *filter_params,
const ConvolveParams *conv_params, int bd) {
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int ntaps = 8;
- int32_t round_add32 = (1 << conv_params->round_1) / 2 + (1 << offset_bits);
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(conv_params->round_1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
- const __m128i sub = _mm_set1_epi32(sub32);
+ const __m128i sub = _mm_set1_epi16(sub32);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
int y_qn = subpel_y_qn;
for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
- const int32_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
- // Load up coefficients for the filter and sign-extend to 32-bit precision
- // (to do so, calculate sign bits and then interleave)
- const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
- const __m128i csign0716 = _mm_cmplt_epi16(coeff0716, zero);
- const __m128i coeff03 = _mm_unpacklo_epi16(coeff0716, csign0716);
- const __m128i coeff47 = _mm_unpackhi_epi16(coeff0716, csign0716);
-
int x;
for (x = 0; x <= w - 4; x += 4) {
- const int32_t *const src0 = src_y + x * src_stride;
- const int32_t *const src1 = src0 + 1 * src_stride;
- const int32_t *const src2 = src0 + 2 * src_stride;
- const int32_t *const src3 = src0 + 3 * src_stride;
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
// Load the source data for the three rows, adding the three registers of
// convolved products to one as we go (conv0..conv3) to avoid the
// register pressure getting too high.
- const __m128i conv0 = convolve_32_8(src0, coeff03, coeff47);
- const __m128i conv1 = convolve_32_8(src1, coeff03, coeff47);
- const __m128i conv2 = convolve_32_8(src2, coeff03, coeff47);
- const __m128i conv3 = convolve_32_8(src3, coeff03, coeff47);
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
// Now reduce horizontally to get one lane for each result
const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
- const __m128i conv = _mm_hadd_epi32(conv01, conv23);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+ conv = _mm_add_epi32(conv, res_add_const);
// Divide down by (1 << round_1), rounding to nearest and subtract sub32.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
- const __m128i subbed = _mm_sub_epi32(shifted, sub);
-
- int32_t *dst_x = dst + y * dst_stride + x;
- const __m128i result =
- (conv_params->do_average)
- ? _mm_add_epi32(subbed, _mm_loadu_si128((__m128i *)dst_x))
- : subbed;
-
- _mm_storeu_si128((__m128i *)dst_x, result);
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint8_t *dst_x = dst + y * dst_stride + x;
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+ __m128i result;
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
+ }
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+ } else {
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+ }
}
for (; x < w; ++x) {
- const int32_t *src_x = src_y + x * src_stride;
- CONV_BUF_TYPE sum = 1 << offset_bits;
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
- CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - sub32;
- if (conv_params->do_average)
- dst[y * dst_stride + x] += res;
- else
- dst[y * dst_stride + x] = res;
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - sub32;
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
}
}
}
-
void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int x_step_qn,
const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params) {
- int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+ // TODO(yaowu): remove unnecessary initializations
+ int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
filter_params_y->taps;
const int xtaps = filter_params_x->taps;
const int ytaps = filter_params_y->taps;
-
const int fo_vert = ytaps / 2 - 1;
+ assert((xtaps == 8) && (ytaps == 8));
+ (void)xtaps;
// horizontal filter
- if (xtaps == 8)
- hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
- x_step_qn, filter_params_x, conv_params->round_0);
- else
- hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
- x_step_qn, filter_params_x, conv_params->round_0);
+ hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
+ x_step_qn, filter_params_x, conv_params->round_0);
// vertical filter (input is transposed)
- if (ytaps == 8)
- vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, 8);
- else
- vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, 8);
+ vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, 8);
}
-#if CONFIG_HIGHBITDEPTH
-// An wrapper to generate the SHUFPD instruction with __m128i types (just
-// writing _mm_shuffle_pd at the callsites gets a bit ugly because of the
-// casts)
-static __m128i mm_shuffle0_si128(__m128i a, __m128i b) {
- __m128d ad = _mm_castsi128_pd(a);
- __m128d bd = _mm_castsi128_pd(b);
- return _mm_castpd_si128(_mm_shuffle_pd(ad, bd, 0));
-}
-
-// The horizontal filter for av1_highbd_convolve_2d_scale_sse4_1. This
-// is the more general version, supporting 10 and 12 tap filters. For
-// 8-tap filters, use hfilter8.
-static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
- int w, int h, int subpel_x_qn, int x_step_qn,
- const InterpFilterParams *filter_params,
- unsigned round, int bd) {
- const int ntaps = filter_params->taps;
- assert(ntaps == 10 || ntaps == 12);
+// A specialised version of hfilter, the horizontal filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
+ int w, int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params,
+ unsigned round, int bd) {
+ const int ntaps = 8;
src -= ntaps / 2 - 1;
- // Construct a mask with which we'll AND filter coefficients 89ab89ab to zero
- // out the unneeded entries.
- const __m128i hicoeff_mask = make_1012_mask(ntaps);
-
int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
int x_qn = subpel_x_qn;
for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
@@ -475,11 +280,8 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
const int16_t *filter =
av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
- // The "lo" coefficients are coefficients 0..7. For a 12-tap filter, the
- // "hi" coefficients are arranged as 89ab89ab. For a 10-tap filter, they
- // are masked out with hicoeff_mask.
+ // Load the filter coefficients
const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
- const __m128i coeffhi = load_and_128i(filter + 8, hicoeff_mask);
int y;
for (y = 0; y <= h - 4; y += 4) {
@@ -488,43 +290,31 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
const uint16_t *const src2 = src0 + 2 * src_stride;
const uint16_t *const src3 = src0 + 3 * src_stride;
- // Load up source data. This is 16-bit input data, so each load gets 8
- // pixels (we need at most 12)
+ // Load up source data. This is 16-bit input data, so each load gets the 8
+ // pixels we need.
const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
- const __m128i data0hi = _mm_loadu_si128((__m128i *)(src0 + 8));
- const __m128i data1hi = _mm_loadu_si128((__m128i *)(src1 + 8));
- const __m128i data2hi = _mm_loadu_si128((__m128i *)(src2 + 8));
- const __m128i data3hi = _mm_loadu_si128((__m128i *)(src3 + 8));
-
- // The "hi" data has rubbish in the top half so interleave pairs together
- // to minimise the calculation we need to do.
- const __m128i data01hi = mm_shuffle0_si128(data0hi, data1hi);
- const __m128i data23hi = mm_shuffle0_si128(data2hi, data3hi);
// Multiply by coefficients
const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
- const __m128i conv01hi = _mm_madd_epi16(data01hi, coeffhi);
- const __m128i conv23hi = _mm_madd_epi16(data23hi, coeffhi);
// Reduce horizontally and add
const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
- const __m128i convlo = _mm_hadd_epi32(conv01lo, conv23lo);
- const __m128i convhi = _mm_hadd_epi32(conv01hi, conv23hi);
- const __m128i conv = _mm_add_epi32(convlo, convhi);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
// Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
+ __m128i shifted =
_mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+ shifted = _mm_packus_epi32(shifted, shifted);
// Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
}
for (; y < h; ++y) {
const uint16_t *const src_row = src_col + y * src_stride;
@@ -538,108 +328,173 @@ static void highbd_hfilter(const uint16_t *src, int src_stride, int32_t *dst,
}
}
}
-
-// A specialised version of hfilter, the horizontal filter for
+// A specialised version of vfilter, the vertical filter for
// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
// filters.
-static void highbd_hfilter8(const uint16_t *src, int src_stride, int32_t *dst,
- int w, int h, int subpel_x_qn, int x_step_qn,
+static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
+ int dst_stride, int w, int h, int subpel_y_qn,
+ int y_step_qn,
const InterpFilterParams *filter_params,
- unsigned round, int bd) {
+ const ConvolveParams *conv_params, int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
const int ntaps = 8;
- src -= ntaps / 2 - 1;
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
- int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
- const __m128i round_add = _mm_set1_epi32(round_add32);
- const __m128i round_shift = extend_32_to_128(round);
+ const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ const __m128i sub = _mm_set1_epi32(sub32);
- int x_qn = subpel_x_qn;
- for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
- const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
- const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const __m128i clip_pixel_ =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
assert(filter_idx < SUBPEL_SHIFTS);
const int16_t *filter =
av1_get_interp_filter_subpel_kernel(*filter_params, filter_idx);
- // Load the filter coefficients
- const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
-
- int y;
- for (y = 0; y <= h - 4; y += 4) {
- const uint16_t *const src0 = src_col + y * src_stride;
- const uint16_t *const src1 = src0 + 1 * src_stride;
- const uint16_t *const src2 = src0 + 2 * src_stride;
- const uint16_t *const src3 = src0 + 3 * src_stride;
-
- // Load up source data. This is 16-bit input data, so each load gets the 8
- // pixels we need.
- const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
- const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
- const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
- const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
-
- // Multiply by coefficients
- const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
- const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
- const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
- const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+ const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+ int x;
+ for (x = 0; x <= w - 4; x += 4) {
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
- // Reduce horizontally and add
- const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
- const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
- const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+ // Load the source data for the three rows, adding the three registers of
+ // convolved products to one as we go (conv0..conv3) to avoid the
+ // register pressure getting too high.
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
- // Divide down by (1 << round), rounding to nearest.
- const __m128i shifted =
- _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+ // Now reduce horizontally to get one lane for each result
+ const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+ const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+ conv = _mm_add_epi32(conv, res_add_const);
- // Write transposed to the output
- _mm_storeu_si128((__m128i *)(dst + y + x * h), shifted);
+ // Divide down by (1 << round_1), rounding to nearest and subtract sub32.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint16_t *dst_x = dst + y * dst_stride + x;
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+
+ __m128i result;
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ __m128i p_32 =
+ _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
+
+ if (conv_params->use_jnt_comp_avg) {
+ shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(shifted, wt1));
+ shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
+ } else {
+ shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
+ }
+ __m128i res32 = _mm_sub_epi32(shifted, sub);
+ res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const),
+ round_bits_shift);
+
+ __m128i res16 = _mm_packus_epi32(res32, res32);
+ res16 = _mm_min_epi16(res16, clip_pixel_);
+ _mm_storel_epi64((__m128i *)dst_x, res16);
+ } else {
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ const __m128i subbed = _mm_sub_epi32(shifted, sub);
+ result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift);
+ result = _mm_packus_epi32(result, result);
+ result = _mm_min_epi16(result, clip_pixel_);
+ _mm_storel_epi64((__m128i *)dst_x, result);
+ }
}
- for (; y < h; ++y) {
- const uint16_t *const src_row = src_col + y * src_stride;
- int32_t sum = (1 << (bd + FILTER_BITS - 1));
- for (int k = 0; k < ntaps; ++k) {
- sum += filter[k] * src_row[k];
+ for (; x < w; ++x) {
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
}
-
- dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
}
}
}
void av1_highbd_convolve_2d_scale_sse4_1(
- const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride,
- int w, int h, InterpFilterParams *filter_params_x,
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y, const int subpel_x_qn,
const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
ConvolveParams *conv_params, int bd) {
- int32_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+ // TODO(yaowu): Move this out of stack
+ DECLARE_ALIGNED(16, int16_t,
+ tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
filter_params_y->taps;
-
const int xtaps = filter_params_x->taps;
const int ytaps = filter_params_y->taps;
const int fo_vert = ytaps / 2 - 1;
+ memset(tmp, 0, sizeof(tmp));
+ assert((xtaps == 8) && (ytaps == 8));
+ (void)xtaps;
+
// horizontal filter
- if (xtaps == 8)
- highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
- subpel_x_qn, x_step_qn, filter_params_x,
- conv_params->round_0, bd);
- else
- highbd_hfilter(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
- subpel_x_qn, x_step_qn, filter_params_x,
- conv_params->round_0, bd);
+ highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+ subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0,
+ bd);
// vertical filter (input is transposed)
- if (ytaps == 8)
- vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, bd);
- else
- vfilter(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
- filter_params_y, conv_params, bd);
+ highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, bd);
}
-#endif // CONFIG_HIGHBITDEPTH
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
deleted file mode 100644
index e85c15eaf..000000000
--- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
+++ /dev/null
@@ -1,1034 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <tmmintrin.h>
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-#include "av1/common/filter.h"
-
-#define WIDTH_BOUND (16)
-#define HEIGHT_BOUND (16)
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-DECLARE_ALIGNED(16, static int8_t,
- sub_pel_filters_12sharp_signal_dir[15][2][16]);
-
-DECLARE_ALIGNED(16, static int8_t,
- sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
-#endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-
-#if USE_TEMPORALFILTER_12TAP
-DECLARE_ALIGNED(16, static int8_t,
- sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]);
-
-DECLARE_ALIGNED(16, static int8_t,
- sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]);
-#endif
-
-typedef int8_t (*SubpelFilterCoeffs)[16];
-
-static INLINE SubpelFilterCoeffs
-get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
- if (p.interp_filter == MULTITAP_SHARP) {
- return &sub_pel_filters_12sharp_signal_dir[index][0];
- }
-#endif
-#if USE_TEMPORALFILTER_12TAP
- if (p.interp_filter == TEMPORALFILTER_12TAP) {
- return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
- }
-#endif
- (void)p;
- (void)index;
- return NULL;
-}
-
-static INLINE SubpelFilterCoeffs
-get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
- if (p.interp_filter == MULTITAP_SHARP) {
- return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
- }
-#endif
-#if USE_TEMPORALFILTER_12TAP
- if (p.interp_filter == TEMPORALFILTER_12TAP) {
- return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
- }
-#endif
- (void)p;
- (void)index;
- return NULL;
-}
-
-static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
- __m128i t0, t1;
-
- t0 = _mm_unpacklo_epi16(in[0], in[1]);
- t1 = _mm_unpacklo_epi16(in[2], in[3]);
-
- out[0] = _mm_unpacklo_epi32(t0, t1);
- out[1] = _mm_srli_si128(out[0], 8);
- out[2] = _mm_unpackhi_epi32(t0, t1);
- out[3] = _mm_srli_si128(out[2], 8);
-
- t0 = _mm_unpackhi_epi16(in[0], in[1]);
- t1 = _mm_unpackhi_epi16(in[2], in[3]);
-
- out[4] = _mm_unpacklo_epi32(t0, t1);
- out[5] = _mm_srli_si128(out[4], 8);
- // Note: We ignore out[6] and out[7] because
- // they're zero vectors.
-}
-
-typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
-
-static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- __m128i y = _mm_loadl_epi64((__m128i const *)src);
- y = _mm_unpacklo_epi8(y, zero);
- y = _mm_add_epi16(*x, y);
- y = _mm_add_epi16(y, one);
- y = _mm_srai_epi16(y, 1);
- y = _mm_packus_epi16(y, y);
- return y;
-}
-
-static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
- uint32_t temp;
- __m128i u = _mm_packus_epi16(*x, *x);
- temp = _mm_cvtsi128_si32(u);
- *(uint16_t *)dst = (uint16_t)temp;
-}
-
-static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
- uint32_t temp;
- __m128i y = accumulate_store(x, dst);
- temp = _mm_cvtsi128_si32(y);
- *(uint16_t *)dst = (uint16_t)temp;
-}
-
-static store_pixel_t store2pixelTab[2] = { store_2_pixel_only,
- accumulate_store_2_pixel };
-
-static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
- __m128i u = _mm_packus_epi16(*x, *x);
- *(int *)dst = _mm_cvtsi128_si32(u);
-}
-
-static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
- __m128i y = accumulate_store(x, dst);
- *(int *)dst = _mm_cvtsi128_si32(y);
-}
-
-static store_pixel_t store4pixelTab[2] = { store_4_pixel_only,
- accumulate_store_4_pixel };
-
-static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
- store_pixel_t store_func, uint8_t *dst) {
- __m128i sumPairRow[4];
- __m128i sumPairCol[8];
- __m128i pixel;
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i zero = _mm_setzero_si128();
-
- assert(tapsNum == 10 || tapsNum == 12);
- if (10 == tapsNum) {
- src -= 1;
- }
-
- pixel = _mm_loadu_si128((__m128i const *)src);
- sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]);
- sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]);
- sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2);
-
- pixel = _mm_loadu_si128((__m128i const *)(src + 1));
- sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]);
- sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]);
- sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2);
-
- transpose_4x8(sumPairRow, sumPairCol);
-
- sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]);
- sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]);
-
- sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]);
- sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]);
-
- sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]);
- sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]);
- sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
-
- sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
- sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
- sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
-
- store_func(&sumPairRow[1], dst);
-}
-
-static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
- store_pixel_t store, uint8_t *buf) {
- horiz_w4_ssse3(src, f, tapsNum, store, buf);
- src += 4;
- buf += 4;
- horiz_w4_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
- store_pixel_t store, uint8_t *buf) {
- horiz_w8_ssse3(src, f, tapsNum, store, buf);
- src += 8;
- buf += 8;
- horiz_w8_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
- store_pixel_t store, uint8_t *buf) {
- horiz_w16_ssse3(src, f, tapsNum, store, buf);
- src += 16;
- buf += 16;
- horiz_w16_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
- store_pixel_t store, uint8_t *buf) {
- horiz_w32_ssse3(src, f, tapsNum, store, buf);
- src += 32;
- buf += 32;
- horiz_w32_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
- store_pixel_t store, uint8_t *buf) {
- horiz_w64_ssse3(src, f, tapsNum, store, buf);
- src += 64;
- buf += 64;
- horiz_w64_ssse3(src, f, tapsNum, store, buf);
-}
-
-static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t,
- uint8_t *) = {
- horiz_w4_ssse3, horiz_w8_ssse3, horiz_w16_ssse3,
- horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3,
-};
-
-static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
- int width, store_pixel_t store, uint8_t *dst) {
- switch (width) {
- // Note:
- // For width=2 and 4, store function must be different
- case 2:
- case 4: horizTab[0](src, f, tapsNum, store, dst); break;
- case 8: horizTab[1](src, f, tapsNum, store, dst); break;
- case 16: horizTab[2](src, f, tapsNum, store, dst); break;
- case 32: horizTab[3](src, f, tapsNum, store, dst); break;
- case 64: horizTab[4](src, f, tapsNum, store, dst); break;
- case 128: horizTab[5](src, f, tapsNum, store, dst); break;
- default: assert(0);
- }
-}
-
-// Vertical 8-pixel parallel
-typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
- uint8_t *dst, int dst_stride);
-
-static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
- int src_stride, uint8_t *dst,
- int dst_stride) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- __m128i v0, v1, v2, v3;
-
- __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
- __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
- __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
- __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
- __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
- u0 = _mm_mulhrs_epi16(u0, k_256);
- u1 = _mm_mulhrs_epi16(u1, k_256);
- u2 = _mm_mulhrs_epi16(u2, k_256);
- u3 = _mm_mulhrs_epi16(u3, k_256);
- u4 = _mm_mulhrs_epi16(u4, k_256);
- u5 = _mm_mulhrs_epi16(u5, k_256);
- u6 = _mm_mulhrs_epi16(u6, k_256);
- u7 = _mm_mulhrs_epi16(u7, k_256);
-
- v0 = _mm_packus_epi16(u0, u1);
- v1 = _mm_packus_epi16(u2, u3);
- v2 = _mm_packus_epi16(u4, u5);
- v3 = _mm_packus_epi16(u6, u7);
-
- u0 = _mm_unpacklo_epi8(v0, v1);
- u1 = _mm_unpackhi_epi8(v0, v1);
- u2 = _mm_unpacklo_epi8(v2, v3);
- u3 = _mm_unpackhi_epi8(v2, v3);
-
- u4 = _mm_unpacklo_epi8(u0, u1);
- u5 = _mm_unpacklo_epi8(u2, u3);
- u6 = _mm_unpackhi_epi8(u0, u1);
- u7 = _mm_unpackhi_epi8(u2, u3);
-
- u0 = _mm_unpacklo_epi32(u4, u5);
- u1 = _mm_unpackhi_epi32(u4, u5);
- u2 = _mm_unpacklo_epi32(u6, u7);
- u3 = _mm_unpackhi_epi32(u6, u7);
-
- u4 = _mm_srli_si128(u0, 8);
- u5 = _mm_srli_si128(u1, 8);
- u6 = _mm_srli_si128(u2, 8);
- u7 = _mm_srli_si128(u3, 8);
-
- _mm_storel_epi64((__m128i *)dst, u0);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
-}
-
-static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
- int src_stride, uint8_t *dst,
- int dst_stride) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-
- __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
- __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
- __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
- __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
- __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
- u0 = _mm_mulhrs_epi16(u0, k_256);
- u1 = _mm_mulhrs_epi16(u1, k_256);
- u2 = _mm_mulhrs_epi16(u2, k_256);
- u3 = _mm_mulhrs_epi16(u3, k_256);
- u4 = _mm_mulhrs_epi16(u4, k_256);
- u5 = _mm_mulhrs_epi16(u5, k_256);
- u6 = _mm_mulhrs_epi16(u6, k_256);
- u7 = _mm_mulhrs_epi16(u7, k_256);
-
- v0 = _mm_packus_epi16(u0, u1);
- v1 = _mm_packus_epi16(u2, u3);
- v2 = _mm_packus_epi16(u4, u5);
- v3 = _mm_packus_epi16(u6, u7);
-
- u0 = _mm_unpacklo_epi8(v0, v1);
- u1 = _mm_unpackhi_epi8(v0, v1);
- u2 = _mm_unpacklo_epi8(v2, v3);
- u3 = _mm_unpackhi_epi8(v2, v3);
-
- u4 = _mm_unpacklo_epi8(u0, u1);
- u5 = _mm_unpacklo_epi8(u2, u3);
- u6 = _mm_unpackhi_epi8(u0, u1);
- u7 = _mm_unpackhi_epi8(u2, u3);
-
- u0 = _mm_unpacklo_epi32(u4, u5);
- u1 = _mm_unpackhi_epi32(u4, u5);
- u2 = _mm_unpacklo_epi32(u6, u7);
- u3 = _mm_unpackhi_epi32(u6, u7);
-
- u4 = _mm_srli_si128(u0, 8);
- u5 = _mm_srli_si128(u1, 8);
- u6 = _mm_srli_si128(u2, 8);
- u7 = _mm_srli_si128(u3, 8);
-
- v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
- v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
- v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
- v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
- v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
- v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
- v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
- v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));
-
- u0 = _mm_unpacklo_epi8(u0, zero);
- u1 = _mm_unpacklo_epi8(u1, zero);
- u2 = _mm_unpacklo_epi8(u2, zero);
- u3 = _mm_unpacklo_epi8(u3, zero);
- u4 = _mm_unpacklo_epi8(u4, zero);
- u5 = _mm_unpacklo_epi8(u5, zero);
- u6 = _mm_unpacklo_epi8(u6, zero);
- u7 = _mm_unpacklo_epi8(u7, zero);
-
- v0 = _mm_unpacklo_epi8(v0, zero);
- v1 = _mm_unpacklo_epi8(v1, zero);
- v2 = _mm_unpacklo_epi8(v2, zero);
- v3 = _mm_unpacklo_epi8(v3, zero);
- v4 = _mm_unpacklo_epi8(v4, zero);
- v5 = _mm_unpacklo_epi8(v5, zero);
- v6 = _mm_unpacklo_epi8(v6, zero);
- v7 = _mm_unpacklo_epi8(v7, zero);
-
- v0 = _mm_adds_epi16(u0, v0);
- v1 = _mm_adds_epi16(u4, v1);
- v2 = _mm_adds_epi16(u1, v2);
- v3 = _mm_adds_epi16(u5, v3);
- v4 = _mm_adds_epi16(u2, v4);
- v5 = _mm_adds_epi16(u6, v5);
- v6 = _mm_adds_epi16(u3, v6);
- v7 = _mm_adds_epi16(u7, v7);
-
- v0 = _mm_adds_epi16(v0, one);
- v1 = _mm_adds_epi16(v1, one);
- v2 = _mm_adds_epi16(v2, one);
- v3 = _mm_adds_epi16(v3, one);
- v4 = _mm_adds_epi16(v4, one);
- v5 = _mm_adds_epi16(v5, one);
- v6 = _mm_adds_epi16(v6, one);
- v7 = _mm_adds_epi16(v7, one);
-
- v0 = _mm_srai_epi16(v0, 1);
- v1 = _mm_srai_epi16(v1, 1);
- v2 = _mm_srai_epi16(v2, 1);
- v3 = _mm_srai_epi16(v3, 1);
- v4 = _mm_srai_epi16(v4, 1);
- v5 = _mm_srai_epi16(v5, 1);
- v6 = _mm_srai_epi16(v6, 1);
- v7 = _mm_srai_epi16(v7, 1);
-
- u0 = _mm_packus_epi16(v0, v1);
- u1 = _mm_packus_epi16(v2, v3);
- u2 = _mm_packus_epi16(v4, v5);
- u3 = _mm_packus_epi16(v6, v7);
-
- u4 = _mm_srli_si128(u0, 8);
- u5 = _mm_srli_si128(u1, 8);
- u6 = _mm_srli_si128(u2, 8);
- u7 = _mm_srli_si128(u3, 8);
-
- _mm_storel_epi64((__m128i *)dst, u0);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
-}
-
-static transpose_to_dst_t trans8x8Tab[2] = { transpose8x8_direct_to_dst,
- transpose8x8_accumu_to_dst };
-
-static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
- __m128i t0, t1, t2, t3, u0, u1;
-
- t0 = _mm_unpacklo_epi16(in[0], in[1]);
- t1 = _mm_unpacklo_epi16(in[2], in[3]);
- t2 = _mm_unpacklo_epi16(in[4], in[5]);
- t3 = _mm_unpacklo_epi16(in[6], in[7]);
-
- u0 = _mm_unpacklo_epi32(t0, t1);
- u1 = _mm_unpacklo_epi32(t2, t3);
-
- out[0] = _mm_unpacklo_epi64(u0, u1);
- out[1] = _mm_unpackhi_epi64(u0, u1);
-
- u0 = _mm_unpackhi_epi32(t0, t1);
- u1 = _mm_unpackhi_epi32(t2, t3);
-
- out[2] = _mm_unpacklo_epi64(u0, u1);
- out[3] = _mm_unpackhi_epi64(u0, u1);
-
- t0 = _mm_unpackhi_epi16(in[0], in[1]);
- t1 = _mm_unpackhi_epi16(in[2], in[3]);
- t2 = _mm_unpackhi_epi16(in[4], in[5]);
- t3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- u0 = _mm_unpacklo_epi32(t0, t1);
- u1 = _mm_unpacklo_epi32(t2, t3);
-
- out[4] = _mm_unpacklo_epi64(u0, u1);
- out[5] = _mm_unpackhi_epi64(u0, u1);
-
- // Ignore out[6] and out[7]
- // they're zero vectors.
-}
-
-static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- __m128i *f, int tapsNum, uint16_t *buf) {
- __m128i s[8], t[6];
- __m128i min_x2x3, max_x2x3;
- __m128i temp;
-
- assert(tapsNum == 10 || tapsNum == 12);
- if (tapsNum == 10) {
- src_ptr -= 1;
- }
- s[0] = _mm_loadu_si128((const __m128i *)src_ptr);
- s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
- s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
- s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
- s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
- s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
- s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // TRANSPOSE...
- // Vecotor represents column pixel pairs instead of a row
- transpose_8x16(s, t);
-
- // multiply 2 adjacent elements with the filter and add the result
- s[0] = _mm_maddubs_epi16(t[0], f[0]);
- s[1] = _mm_maddubs_epi16(t[1], f[1]);
- s[2] = _mm_maddubs_epi16(t[2], f[2]);
- s[3] = _mm_maddubs_epi16(t[3], f[3]);
- s[4] = _mm_maddubs_epi16(t[4], f[4]);
- s[5] = _mm_maddubs_epi16(t[5], f[5]);
-
- // add and saturate the results together
- min_x2x3 = _mm_min_epi16(s[2], s[3]);
- max_x2x3 = _mm_max_epi16(s[2], s[3]);
- temp = _mm_adds_epi16(s[0], s[1]);
- temp = _mm_adds_epi16(temp, s[5]);
- temp = _mm_adds_epi16(temp, s[4]);
-
- temp = _mm_adds_epi16(temp, min_x2x3);
- temp = _mm_adds_epi16(temp, max_x2x3);
-
- _mm_storeu_si128((__m128i *)buf, temp);
-}
-
-// Vertical 4-pixel parallel
-static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
- int src_stride, uint8_t *dst,
- int dst_stride) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- __m128i v0, v1, v2, v3;
-
- // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
- __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
- __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
- __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
- __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
-
- v0 = _mm_unpacklo_epi16(u0, u1);
- v1 = _mm_unpacklo_epi16(u2, u3);
-
- v2 = _mm_unpacklo_epi32(v0, v1);
- v3 = _mm_unpackhi_epi32(v0, v1);
-
- u0 = _mm_mulhrs_epi16(v2, k_256);
- u1 = _mm_mulhrs_epi16(v3, k_256);
-
- u0 = _mm_packus_epi16(u0, u1);
- u1 = _mm_srli_si128(u0, 4);
- u2 = _mm_srli_si128(u0, 8);
- u3 = _mm_srli_si128(u0, 12);
-
- *(int *)(dst) = _mm_cvtsi128_si32(u0);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
- *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
- *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
-}
-
-static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
- int src_stride, uint8_t *dst,
- int dst_stride) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
-
- __m128i v0, v1, v2, v3;
-
- __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
- __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
- __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
- __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
-
- v0 = _mm_unpacklo_epi16(u0, u1);
- v1 = _mm_unpacklo_epi16(u2, u3);
-
- v2 = _mm_unpacklo_epi32(v0, v1);
- v3 = _mm_unpackhi_epi32(v0, v1);
-
- u0 = _mm_mulhrs_epi16(v2, k_256);
- u1 = _mm_mulhrs_epi16(v3, k_256);
-
- u2 = _mm_packus_epi16(u0, u1);
- u0 = _mm_unpacklo_epi8(u2, zero);
- u1 = _mm_unpackhi_epi8(u2, zero);
-
- // load pixel values
- v0 = _mm_loadl_epi64((__m128i const *)(dst));
- v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
- v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
- v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
-
- v0 = _mm_unpacklo_epi8(v0, zero);
- v1 = _mm_unpacklo_epi8(v1, zero);
- v2 = _mm_unpacklo_epi8(v2, zero);
- v3 = _mm_unpacklo_epi8(v3, zero);
-
- v0 = _mm_unpacklo_epi64(v0, v1);
- v1 = _mm_unpacklo_epi64(v2, v3);
-
- u0 = _mm_adds_epi16(u0, v0);
- u1 = _mm_adds_epi16(u1, v1);
-
- u0 = _mm_adds_epi16(u0, one);
- u1 = _mm_adds_epi16(u1, one);
-
- u0 = _mm_srai_epi16(u0, 1);
- u1 = _mm_srai_epi16(u1, 1);
-
- // saturation and pack to pixels
- u0 = _mm_packus_epi16(u0, u1);
- u1 = _mm_srli_si128(u0, 4);
- u2 = _mm_srli_si128(u0, 8);
- u3 = _mm_srli_si128(u0, 12);
-
- *(int *)(dst) = _mm_cvtsi128_si32(u0);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
- *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
- *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
-}
-
-static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst,
- transpose4x4_accumu_to_dst };
-
-static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- __m128i *f, int tapsNum, uint16_t *buf) {
- __m128i A, B, C, D;
- __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
- __m128i x0, x1, x2, x3, x4, x5;
- __m128i min_x2x3, max_x2x3, temp;
-
- assert(tapsNum == 10 || tapsNum == 12);
- if (tapsNum == 10) {
- src_ptr -= 1;
- }
- A = _mm_loadu_si128((const __m128i *)src_ptr);
- B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
- C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-
- // TRANSPOSE...
- // Vecotor represents column pixel pairs instead of a row
- // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
- tr0_0 = _mm_unpacklo_epi16(A, B);
- // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
- tr0_1 = _mm_unpacklo_epi16(C, D);
- // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
- s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 02 03 12 13 22 23 32 33
- s3s2 = _mm_srli_si128(s1s0, 8);
- // 06 07 16 17 26 27 36 37
- s7s6 = _mm_srli_si128(s5s4, 8);
-
- tr0_0 = _mm_unpackhi_epi16(A, B);
- tr0_1 = _mm_unpackhi_epi16(C, D);
- s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- sbsa = _mm_srli_si128(s9s8, 8);
-
- // multiply 2 adjacent elements with the filter and add the result
- x0 = _mm_maddubs_epi16(s1s0, f[0]);
- x1 = _mm_maddubs_epi16(s3s2, f[1]);
- x2 = _mm_maddubs_epi16(s5s4, f[2]);
- x3 = _mm_maddubs_epi16(s7s6, f[3]);
- x4 = _mm_maddubs_epi16(s9s8, f[4]);
- x5 = _mm_maddubs_epi16(sbsa, f[5]);
- // add and saturate the results together
- min_x2x3 = _mm_min_epi16(x2, x3);
- max_x2x3 = _mm_max_epi16(x2, x3);
- temp = _mm_adds_epi16(x0, x1);
- temp = _mm_adds_epi16(temp, x5);
- temp = _mm_adds_epi16(temp, x4);
-
- temp = _mm_adds_epi16(temp, min_x2x3);
- temp = _mm_adds_epi16(temp, max_x2x3);
- _mm_storel_epi64((__m128i *)buf, temp);
-}
-
-// Note:
-// This function assumes:
-// (1) 10/12-taps filters
-// (2) x_step_q4 = 16 then filter is fixed at the call
-
-void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams filter_params,
- const int subpel_x_q4, int x_step_q4,
- ConvolveParams *conv_params) {
- DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);
- __m128i verf[6];
- __m128i horf[2];
- SubpelFilterCoeffs hCoeffs, vCoeffs;
- assert(conv_params->do_average == 0 || conv_params->do_average == 1);
- const uint8_t *src_ptr;
- store_pixel_t store2p = store2pixelTab[conv_params->do_average];
- store_pixel_t store4p = store4pixelTab[conv_params->do_average];
- transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->do_average];
- transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->do_average];
-
- const int tapsNum = filter_params.taps;
- int block_height, block_residu;
- int i, col, count;
- (void)x_step_q4;
-
- if (0 == subpel_x_q4 || 16 != x_step_q4) {
- av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
- subpel_x_q4, x_step_q4, conv_params);
- return;
- }
-
- hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
- vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
-
- if (!hCoeffs || !vCoeffs) {
- av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
- subpel_x_q4, x_step_q4, conv_params);
- return;
- }
-
- verf[0] = *((const __m128i *)(vCoeffs));
- verf[1] = *((const __m128i *)(vCoeffs + 1));
- verf[2] = *((const __m128i *)(vCoeffs + 2));
- verf[3] = *((const __m128i *)(vCoeffs + 3));
- verf[4] = *((const __m128i *)(vCoeffs + 4));
- verf[5] = *((const __m128i *)(vCoeffs + 5));
-
- horf[0] = *((const __m128i *)(hCoeffs));
- horf[1] = *((const __m128i *)(hCoeffs + 1));
-
- count = 0;
-
- // here tapsNum is filter size
- src -= (tapsNum >> 1) - 1;
- src_ptr = src;
- if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
- // 8-pixels parallel
- block_height = h >> 3;
- block_residu = h & 7;
-
- do {
- for (col = 0; col < w; col += 8) {
- for (i = 0; i < 8; ++i) {
- filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
- temp + (i * 8));
- src_ptr += 1;
- }
- transpose_8x8(temp, 8, dst + col, dst_stride);
- }
- count++;
- src_ptr = src + count * src_stride * 8;
- dst += dst_stride * 8;
- } while (count < block_height);
-
- for (i = 0; i < block_residu; ++i) {
- filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
- src_ptr += src_stride;
- dst += dst_stride;
- }
- } else {
- if (w > 2) {
- // 4-pixels parallel
- block_height = h >> 2;
- block_residu = h & 3;
-
- do {
- for (col = 0; col < w; col += 4) {
- for (i = 0; i < 4; ++i) {
- filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
- temp + (i * 4));
- src_ptr += 1;
- }
- transpose_4x4(temp, 4, dst + col, dst_stride);
- }
- count++;
- src_ptr = src + count * src_stride * 4;
- dst += dst_stride * 4;
- } while (count < block_height);
-
- for (i = 0; i < block_residu; ++i) {
- filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
- src_ptr += src_stride;
- dst += dst_stride;
- }
- } else {
- for (i = 0; i < h; i++) {
- filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
- src_ptr += src_stride;
- dst += dst_stride;
- }
- }
- }
-}
-
-// Vertical convolution filtering
-static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
- __m128i u = _mm_packus_epi16(*x, *x);
- _mm_storel_epi64((__m128i *)dst, u);
-}
-
-static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
- __m128i y = accumulate_store(x, dst);
- _mm_storel_epi64((__m128i *)dst, y);
-}
-
-static store_pixel_t store8pixelTab[2] = { store_8_pixel_only,
- accumulate_store_8_pixel };
-
-static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
- int tapsNum, __m128i *f) {
- __m128i s[12];
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i zero = _mm_setzero_si128();
- __m128i min_x2x3, max_x2x3, sum;
- int i = 0;
- int r = 0;
-
- if (10 == tapsNum) {
- i += 1;
- s[0] = zero;
- }
- while (i < 12) {
- s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
- i += 1;
- r += 1;
- }
-
- s[0] = _mm_unpacklo_epi8(s[0], s[1]);
- s[2] = _mm_unpacklo_epi8(s[2], s[3]);
- s[4] = _mm_unpacklo_epi8(s[4], s[5]);
- s[6] = _mm_unpacklo_epi8(s[6], s[7]);
- s[8] = _mm_unpacklo_epi8(s[8], s[9]);
- s[10] = _mm_unpacklo_epi8(s[10], s[11]);
-
- s[0] = _mm_maddubs_epi16(s[0], f[0]);
- s[2] = _mm_maddubs_epi16(s[2], f[1]);
- s[4] = _mm_maddubs_epi16(s[4], f[2]);
- s[6] = _mm_maddubs_epi16(s[6], f[3]);
- s[8] = _mm_maddubs_epi16(s[8], f[4]);
- s[10] = _mm_maddubs_epi16(s[10], f[5]);
-
- min_x2x3 = _mm_min_epi16(s[4], s[6]);
- max_x2x3 = _mm_max_epi16(s[4], s[6]);
- sum = _mm_adds_epi16(s[0], s[2]);
- sum = _mm_adds_epi16(sum, s[10]);
- sum = _mm_adds_epi16(sum, s[8]);
-
- sum = _mm_adds_epi16(sum, min_x2x3);
- sum = _mm_adds_epi16(sum, max_x2x3);
-
- sum = _mm_mulhrs_epi16(sum, k_256);
- sum = _mm_packus_epi16(sum, sum);
- sum = _mm_unpacklo_epi8(sum, zero);
- return sum;
-}
-
-static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
- __m128i *f, int tapsNum,
- store_pixel_t store_func,
- uint8_t *dst) {
- __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
- store_func(&sum, dst);
-}
-
-static void filter_vert_compute_small(const uint8_t *src, int src_stride,
- __m128i *f, int tapsNum,
- store_pixel_t store_func, int h,
- uint8_t *dst, int dst_stride) {
- int rowIndex = 0;
- do {
- filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
- dst);
- rowIndex++;
- src += src_stride;
- dst += dst_stride;
- } while (rowIndex < h);
-}
-
-static void filter_vert_compute_large(const uint8_t *src, int src_stride,
- __m128i *f, int tapsNum,
- store_pixel_t store_func, int w, int h,
- uint8_t *dst, int dst_stride) {
- int col;
- int rowIndex = 0;
- const uint8_t *src_ptr = src;
- uint8_t *dst_ptr = dst;
-
- do {
- for (col = 0; col < w; col += 8) {
- filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
- store_func, dst_ptr);
- src_ptr += 8;
- dst_ptr += 8;
- }
- rowIndex++;
- src_ptr = src + rowIndex * src_stride;
- dst_ptr = dst + rowIndex * dst_stride;
- } while (rowIndex < h);
-}
-
-void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams filter_params,
- const int subpel_y_q4, int y_step_q4,
- ConvolveParams *conv_params) {
- __m128i verf[6];
- SubpelFilterCoeffs vCoeffs;
- const uint8_t *src_ptr;
- assert(conv_params->do_average == 0 || conv_params->do_average == 1);
- uint8_t *dst_ptr = dst;
- store_pixel_t store2p = store2pixelTab[conv_params->do_average];
- store_pixel_t store4p = store4pixelTab[conv_params->do_average];
- store_pixel_t store8p = store8pixelTab[conv_params->do_average];
- const int tapsNum = filter_params.taps;
-
- if (0 == subpel_y_q4 || 16 != y_step_q4) {
- av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
- subpel_y_q4, y_step_q4, conv_params);
- return;
- }
-
- vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
-
- if (!vCoeffs) {
- av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
- subpel_y_q4, y_step_q4, conv_params);
- return;
- }
-
- verf[0] = *((const __m128i *)(vCoeffs));
- verf[1] = *((const __m128i *)(vCoeffs + 1));
- verf[2] = *((const __m128i *)(vCoeffs + 2));
- verf[3] = *((const __m128i *)(vCoeffs + 3));
- verf[4] = *((const __m128i *)(vCoeffs + 4));
- verf[5] = *((const __m128i *)(vCoeffs + 5));
-
- src -= src_stride * ((tapsNum >> 1) - 1);
- src_ptr = src;
-
- if (w > 4) {
- filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h,
- dst_ptr, dst_stride);
- } else if (4 == w) {
- filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h,
- dst_ptr, dst_stride);
- } else if (2 == w) {
- filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h,
- dst_ptr, dst_stride);
- } else {
- assert(0);
- }
-}
-
-static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps,
- int8_t (*simd_horiz_filter)[2][16]) {
- int shift;
- int offset = (12 - taps) / 2;
- const int16_t *filter_row;
- for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
- int i;
- filter_row = filter_ptr + shift * taps;
- for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
-
- for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 1][1][i] = 0;
-
- for (i = 0; i < taps; ++i) {
- simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i];
- simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i];
- }
-
- for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
-
- for (i = offset + 2 + taps; i < 16; ++i)
- simd_horiz_filter[shift - 1][1][i] = 0;
- }
-}
-
-static void init_simd_vert_filter(const int16_t *filter_ptr, int taps,
- int8_t (*simd_vert_filter)[6][16]) {
- int shift;
- int offset = (12 - taps) / 2;
- const int16_t *filter_row;
- for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
- int i;
- filter_row = filter_ptr + shift * taps;
- for (i = 0; i < 6; ++i) {
- int j;
- for (j = 0; j < 16; ++j) {
- int c = i * 2 + (j % 2) - offset;
- if (c >= 0 && c < taps)
- simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c];
- else
- simd_vert_filter[shift - 1][i][j] = 0;
- }
- }
- }
-}
-
-typedef struct SimdFilter {
- InterpFilter interp_filter;
- int8_t (*simd_horiz_filter)[2][16];
- int8_t (*simd_vert_filter)[6][16];
-} SimdFilter;
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-#define MULTITAP_FILTER_NUM 1
-SimdFilter simd_filters[MULTITAP_FILTER_NUM] = {
- { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0],
- &sub_pel_filters_12sharp_ver_signal_dir[0] },
-};
-#endif
-
-#if USE_TEMPORALFILTER_12TAP
-SimdFilter temporal_simd_filter = {
- TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0],
- &sub_pel_filters_temporalfilter_12_ver_signal_dir[0]
-};
-#endif
-
-void av1_lowbd_convolve_init_ssse3(void) {
-#if USE_TEMPORALFILTER_12TAP
- {
- InterpFilterParams filter_params =
- av1_get_interp_filter_params(temporal_simd_filter.interp_filter);
- int taps = filter_params.taps;
- const int16_t *filter_ptr = filter_params.filter_ptr;
- init_simd_horiz_filter(filter_ptr, taps,
- temporal_simd_filter.simd_horiz_filter);
- init_simd_vert_filter(filter_ptr, taps,
- temporal_simd_filter.simd_vert_filter);
- }
-#endif
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
- {
- int i;
- for (i = 0; i < MULTITAP_FILTER_NUM; ++i) {
- InterpFilter interp_filter = simd_filters[i].interp_filter;
- InterpFilterParams filter_params =
- av1_get_interp_filter_params(interp_filter);
- int taps = filter_params.taps;
- const int16_t *filter_ptr = filter_params.filter_ptr;
- init_simd_horiz_filter(filter_ptr, taps,
- simd_filters[i].simd_horiz_filter);
- init_simd_vert_filter(filter_ptr, taps, simd_filters[i].simd_vert_filter);
- }
- }
-#endif
- return;
-}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
deleted file mode 100644
index 97d2e74b1..000000000
--- a/third_party/aom/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ /dev/null
@@ -1,839 +0,0 @@
-#include "av1/common/x86/av1_txfm1d_sse4.h"
-
-void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 32;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[32];
- __m128i buf1[32];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- int j;
- for (j = 0; j < 32; ++j) {
- buf0[j] = input[j * col_num + col];
- }
-
- // stage 1
- stage_idx++;
- buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
- buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
- buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
- buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
- buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
- buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
- buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
- buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
- buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
- buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
- buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
- buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
- buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
- buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
- buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
- buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
- buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
- buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
- buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
- buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
- buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
- buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
- buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
- buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
- buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
- buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
- buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
- buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
- buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
- buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
- buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
- buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
- buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
- buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
- buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
- buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
- buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
- buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
- buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
- buf0[16] = buf1[16];
- buf0[17] = buf1[17];
- buf0[18] = buf1[18];
- buf0[19] = buf1[19];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
- buf0[27], bit);
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
- buf0[25], bit);
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
- buf0[24], bit);
- buf0[28] = buf1[28];
- buf0[29] = buf1[29];
- buf0[30] = buf1[30];
- buf0[31] = buf1[31];
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
- buf1[8] = buf0[8];
- buf1[9] = buf0[9];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
- buf1[13], bit);
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
- buf1[12], bit);
- buf1[14] = buf0[14];
- buf1[15] = buf0[15];
- buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
- buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
- buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
- buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
- buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
- buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
- buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
- buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
- buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
- buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
- buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
- buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
- buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
- buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
- buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
- buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
- buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
- buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
- buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
- buf0[4] = buf1[4];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
- buf0[6], bit);
- buf0[7] = buf1[7];
- buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
- buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
- buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
- buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
- buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
- buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
- buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
- buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
- buf0[16] = buf1[16];
- buf0[17] = buf1[17];
- btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
- buf0[29], bit);
- btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
- buf0[28], bit);
- btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
- buf0[27], bit);
- btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
- buf0[22] = buf1[22];
- buf0[23] = buf1[23];
- buf0[24] = buf1[24];
- buf0[25] = buf1[25];
- buf0[30] = buf1[30];
- buf0[31] = buf1[31];
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
- buf1[1], bit);
- btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
- buf1[3], bit);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
- buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
- buf1[8] = buf0[8];
- btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
- buf1[14], bit);
- btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
- buf1[13], bit);
- buf1[11] = buf0[11];
- buf1[12] = buf0[12];
- buf1[15] = buf0[15];
- buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
- buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
- buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
- buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
- buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
- buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
- buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
- buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
- buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
- buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
- buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
- buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
- buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
- buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
- buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
- buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
- bit);
- btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
- buf0[6], bit);
- buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
- buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
- buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
- buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
- buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
- buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
- buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
- buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
- buf0[16] = buf1[16];
- btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
- buf0[30], bit);
- btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
- buf0[29], bit);
- buf0[19] = buf1[19];
- buf0[20] = buf1[20];
- btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
- btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
- buf0[25], bit);
- buf0[23] = buf1[23];
- buf0[24] = buf1[24];
- buf0[27] = buf1[27];
- buf0[28] = buf1[28];
- buf0[31] = buf1[31];
-
- // stage 7
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf1[0] = buf0[0];
- buf1[1] = buf0[1];
- buf1[2] = buf0[2];
- buf1[3] = buf0[3];
- buf1[4] = buf0[4];
- buf1[5] = buf0[5];
- buf1[6] = buf0[6];
- buf1[7] = buf0[7];
- btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
- buf1[15], bit);
- btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
- buf1[14], bit);
- btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
- buf1[13], bit);
- btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
- buf1[12], bit);
- buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
- buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
- buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
- buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
- buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
- buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
- buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
- buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
- buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
- buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
- buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
- buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
- buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
- buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
- buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
- buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
-
- // stage 8
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- buf0[6] = buf1[6];
- buf0[7] = buf1[7];
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- buf0[10] = buf1[10];
- buf0[11] = buf1[11];
- buf0[12] = buf1[12];
- buf0[13] = buf1[13];
- buf0[14] = buf1[14];
- buf0[15] = buf1[15];
- btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
- buf0[31], bit);
- btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
- buf0[30], bit);
- btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
- buf0[29], bit);
- btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
- buf0[28], bit);
- btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
- buf0[27], bit);
- btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
- btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
- buf0[25], bit);
- btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
- buf0[24], bit);
-
- // stage 9
- stage_idx++;
- buf1[0] = buf0[0];
- buf1[1] = buf0[16];
- buf1[2] = buf0[8];
- buf1[3] = buf0[24];
- buf1[4] = buf0[4];
- buf1[5] = buf0[20];
- buf1[6] = buf0[12];
- buf1[7] = buf0[28];
- buf1[8] = buf0[2];
- buf1[9] = buf0[18];
- buf1[10] = buf0[10];
- buf1[11] = buf0[26];
- buf1[12] = buf0[6];
- buf1[13] = buf0[22];
- buf1[14] = buf0[14];
- buf1[15] = buf0[30];
- buf1[16] = buf0[1];
- buf1[17] = buf0[17];
- buf1[18] = buf0[9];
- buf1[19] = buf0[25];
- buf1[20] = buf0[5];
- buf1[21] = buf0[21];
- buf1[22] = buf0[13];
- buf1[23] = buf0[29];
- buf1[24] = buf0[3];
- buf1[25] = buf0[19];
- buf1[26] = buf0[11];
- buf1[27] = buf0[27];
- buf1[28] = buf0[7];
- buf1[29] = buf0[23];
- buf1[30] = buf0[15];
- buf1[31] = buf0[31];
-
- for (j = 0; j < 32; ++j) {
- output[j * col_num + col] = buf1[j];
- }
- }
-}
-
-void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 4;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[4];
- __m128i buf1[4];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- int j;
- for (j = 0; j < 4; ++j) {
- buf0[j] = input[j * col_num + col];
- }
-
- // stage 1
- stage_idx++;
- buf1[0] = buf0[3];
- buf1[1] = buf0[0];
- buf1[2] = buf0[1];
- buf1[3] = buf0[2];
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
-
- // stage 3
- stage_idx++;
- buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
-
- // stage 5
- stage_idx++;
- buf1[0] = buf0[0];
- buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
- buf1[2] = buf0[3];
- buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
- for (j = 0; j < 4; ++j) {
- output[j * col_num + col] = buf1[j];
- }
- }
-}
-
-void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 32;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[32];
- __m128i buf1[32];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- int j;
- for (j = 0; j < 32; ++j) {
- buf0[j] = input[j * col_num + col];
- }
-
- // stage 1
- stage_idx++;
- buf1[0] = buf0[31];
- buf1[1] = buf0[0];
- buf1[2] = buf0[29];
- buf1[3] = buf0[2];
- buf1[4] = buf0[27];
- buf1[5] = buf0[4];
- buf1[6] = buf0[25];
- buf1[7] = buf0[6];
- buf1[8] = buf0[23];
- buf1[9] = buf0[8];
- buf1[10] = buf0[21];
- buf1[11] = buf0[10];
- buf1[12] = buf0[19];
- buf1[13] = buf0[12];
- buf1[14] = buf0[17];
- buf1[15] = buf0[14];
- buf1[16] = buf0[15];
- buf1[17] = buf0[16];
- buf1[18] = buf0[13];
- buf1[19] = buf0[18];
- buf1[20] = buf0[11];
- buf1[21] = buf0[20];
- buf1[22] = buf0[9];
- buf1[23] = buf0[22];
- buf1[24] = buf0[7];
- buf1[25] = buf0[24];
- buf1[26] = buf0[5];
- buf1[27] = buf0[26];
- buf1[28] = buf0[3];
- buf1[29] = buf0[28];
- buf1[30] = buf0[1];
- buf1[31] = buf0[30];
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
- btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
- bit);
- btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
- buf0[9], bit);
- btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
- btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
- buf0[17], bit);
- btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
- buf0[19], bit);
- btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
- buf0[21], bit);
- btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
- btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
- buf0[25], bit);
- btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
- btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
-
- // stage 3
- stage_idx++;
- buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
- buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
- buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
- buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
- buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
- buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
- buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
- buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
- buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
- buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
- buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
- buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
- buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
- buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
- buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
- buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
- buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
- buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
- buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
- buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
- buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
- buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
- buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
- buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- buf0[6] = buf1[6];
- buf0[7] = buf1[7];
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- buf0[10] = buf1[10];
- buf0[11] = buf1[11];
- buf0[12] = buf1[12];
- buf0[13] = buf1[13];
- buf0[14] = buf1[14];
- buf0[15] = buf1[15];
- btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
- buf0[17], bit);
- btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
- buf0[19], bit);
- btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
- buf0[21], bit);
- btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
- btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
- buf0[25], bit);
- btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
- btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
-
- // stage 5
- stage_idx++;
- buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
- buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
- buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
- buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
- buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
- buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
- buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
- buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
- buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
- buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
- buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
- buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
- buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
- buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
- buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
- buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
- buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
- buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
- buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
- buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- buf0[6] = buf1[6];
- buf0[7] = buf1[7];
- btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
- bit);
- btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
- buf0[16] = buf1[16];
- buf0[17] = buf1[17];
- buf0[18] = buf1[18];
- buf0[19] = buf1[19];
- buf0[20] = buf1[20];
- buf0[21] = buf1[21];
- buf0[22] = buf1[22];
- buf0[23] = buf1[23];
- btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
- buf0[25], bit);
- btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
- btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
-
- // stage 7
- stage_idx++;
- buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
- buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
- buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
- buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
- buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
- buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
- buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
- buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
- buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
- buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
- buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
- buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
- buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
- buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
- buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
- buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
- buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
- buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
- buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
-
- // stage 8
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- buf0[10] = buf1[10];
- buf0[11] = buf1[11];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
- buf0[16] = buf1[16];
- buf0[17] = buf1[17];
- buf0[18] = buf1[18];
- buf0[19] = buf1[19];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
- buf0[21], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
- buf0[24] = buf1[24];
- buf0[25] = buf1[25];
- buf0[26] = buf1[26];
- buf0[27] = buf1[27];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
-
- // stage 9
- stage_idx++;
- buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
- buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
- buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
- buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
- buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
- buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
- buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
- buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
- buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
- buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
- buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
- buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
- buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
- buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
- buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
- buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
- buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
- buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
- buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
-
- // stage 10
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr(bit);
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- buf0[12] = buf1[12];
- buf0[13] = buf1[13];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
- buf0[16] = buf1[16];
- buf0[17] = buf1[17];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
- buf0[19], bit);
- buf0[20] = buf1[20];
- buf0[21] = buf1[21];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
- buf0[24] = buf1[24];
- buf0[25] = buf1[25];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
- buf0[28] = buf1[28];
- buf0[29] = buf1[29];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
-
- // stage 11
- stage_idx++;
- buf1[0] = buf0[0];
- buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
- buf1[2] = buf0[24];
- buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
- buf1[4] = buf0[12];
- buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
- buf1[6] = buf0[20];
- buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
- buf1[8] = buf0[6];
- buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
- buf1[10] = buf0[30];
- buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
- buf1[12] = buf0[10];
- buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
- buf1[14] = buf0[18];
- buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
- buf1[16] = buf0[3];
- buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
- buf1[18] = buf0[27];
- buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
- buf1[20] = buf0[15];
- buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
- buf1[22] = buf0[23];
- buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
- buf1[24] = buf0[5];
- buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
- buf1[26] = buf0[29];
- buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
- buf1[28] = buf0[9];
- buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
- buf1[30] = buf0[17];
- buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
- for (j = 0; j < 32; ++j) {
- output[j * col_num + col] = buf1[j];
- }
- }
-}
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
deleted file mode 100644
index 58ede028a..000000000
--- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "av1/common/enums.h"
-#include "av1/common/av1_txfm.h"
-#include "av1/common/x86/av1_txfm1d_sse4.h"
-
-static INLINE void int16_array_with_stride_to_int32_array_without_stride(
- const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
- int r, c;
- for (r = 0; r < txfm1d_size; r++) {
- for (c = 0; c < txfm1d_size; c++) {
- output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
- }
- }
-}
-
-typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-
-static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
- switch (txfm_type) {
- case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break;
- case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break;
- default: assert(0);
- }
- return NULL;
-}
-
-static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
- const int stride,
- const TXFM_2D_FLIP_CFG *cfg,
- int32_t *txfm_buf) {
- // TODO(sarahparker) This does not currently support rectangular transforms
- // and will break without splitting txfm_size out into row and col size.
- // Rectangular transforms use c code only, so it should be ok for now.
- // It will be corrected when there are sse implementations for rectangular
- // transforms.
- assert(cfg->row_cfg->txfm_size == cfg->col_cfg->txfm_size);
- const int txfm_size = cfg->row_cfg->txfm_size;
- const int8_t *shift = cfg->row_cfg->shift;
- const int8_t *stage_range_col = cfg->col_cfg->stage_range;
- const int8_t *stage_range_row = cfg->row_cfg->stage_range;
- const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
- const int8_t *cos_bit_row = cfg->row_cfg->cos_bit;
- const TxfmFuncSSE2 txfm_func_col =
- fwd_txfm_type_to_func(cfg->col_cfg->txfm_type);
- const TxfmFuncSSE2 txfm_func_row =
- fwd_txfm_type_to_func(cfg->row_cfg->txfm_type);
-
- __m128i *buf_128 = (__m128i *)txfm_buf;
- __m128i *out_128 = (__m128i *)output;
- int num_per_128 = 4;
- int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
-
- int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
- txfm_size);
- round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
- txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
- round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
- transpose_32(txfm_size, out_128, buf_128);
- txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
- round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
- transpose_32(txfm_size, buf_128, out_128);
-}
-
-void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
- int stride, TX_TYPE tx_type, int bd) {
- DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
- TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
- (void)bd;
- fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
-}
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
index 68461bc36..212d3bd72 100644
--- a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -12,81 +12,14 @@
#include <assert.h>
#include <smmintrin.h>
-#include "./av1_rtcd.h"
-#include "av1/common/filter.h"
-
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
-DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
-#endif
-
-#if USE_TEMPORALFILTER_12TAP
-DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]);
-#endif
+#include "config/av1_rtcd.h"
-typedef int16_t (*HbdSubpelFilterCoeffs)[8];
+#include "av1/common/filter.h"
typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
int src_stride, uint16_t *dst, int dst_stride,
int bd);
-static INLINE HbdSubpelFilterCoeffs
-hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
- if (p.interp_filter == MULTITAP_SHARP) {
- return &subpel_filters_sharp[index][0];
- }
-#endif
-#if USE_TEMPORALFILTER_12TAP
- if (p.interp_filter == TEMPORALFILTER_12TAP) {
- return &subpel_temporalfilter[index][0];
- }
-#endif
- (void)p;
- (void)index;
- return NULL;
-}
-
-static void init_simd_filter(const int16_t *filter_ptr, int taps,
- int16_t (*simd_filter)[6][8]) {
- int shift;
- int offset = (12 - taps) / 2;
- for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
- const int16_t *filter_row = filter_ptr + shift * taps;
- int i, j;
- for (i = 0; i < 12; ++i) {
- for (j = 0; j < 4; ++j) {
- int r = i / 2;
- int c = j * 2 + (i % 2);
- if (i - offset >= 0 && i - offset < taps)
- simd_filter[shift - 1][r][c] = filter_row[i - offset];
- else
- simd_filter[shift - 1][r][c] = 0;
- }
- }
- }
-}
-
-void av1_highbd_convolve_init_sse4_1(void) {
-#if USE_TEMPORALFILTER_12TAP
- {
- InterpFilterParams filter_params =
- av1_get_interp_filter_params(TEMPORALFILTER_12TAP);
- int taps = filter_params.taps;
- const int16_t *filter_ptr = filter_params.filter_ptr;
- init_simd_filter(filter_ptr, taps, subpel_temporalfilter);
- }
-#endif
-#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
- {
- InterpFilterParams filter_params =
- av1_get_interp_filter_params(MULTITAP_SHARP);
- int taps = filter_params.taps;
- const int16_t *filter_ptr = filter_params.filter_ptr;
- init_simd_filter(filter_ptr, taps, subpel_filters_sharp);
- }
-#endif
-}
-
// pixelsNum 0: write all 4 pixels
// 1/2/3: residual pixels 1/2/3
static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
@@ -218,138 +151,6 @@ void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
writePixel(u, width, pixelsNum, dst, dst_stride);
}
-static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 };
-
-static INLINE void transpose_pair(__m128i *in, __m128i *out) {
- __m128i x0, x1;
-
- x0 = _mm_unpacklo_epi32(in[0], in[1]);
- x1 = _mm_unpacklo_epi32(in[2], in[3]);
-
- out[0] = _mm_unpacklo_epi64(x0, x1);
- out[1] = _mm_unpackhi_epi64(x0, x1);
-
- x0 = _mm_unpackhi_epi32(in[0], in[1]);
- x1 = _mm_unpackhi_epi32(in[2], in[3]);
-
- out[2] = _mm_unpacklo_epi64(x0, x1);
- out[3] = _mm_unpackhi_epi64(x0, x1);
-
- x0 = _mm_unpacklo_epi32(in[4], in[5]);
- x1 = _mm_unpacklo_epi32(in[6], in[7]);
-
- out[4] = _mm_unpacklo_epi64(x0, x1);
- out[5] = _mm_unpackhi_epi64(x0, x1);
-}
-
-static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
- int tapsNum, uint32_t *buf) {
- __m128i u[8], v[6];
-
- assert(tapsNum == 10 || tapsNum == 12);
- if (tapsNum == 10) {
- src -= 1;
- }
-
- u[0] = _mm_loadu_si128((__m128i const *)src);
- u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
- u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
- u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
- u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
- u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
- u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));
-
- transpose_pair(u, v);
-
- u[0] = _mm_madd_epi16(v[0], f[0]);
- u[1] = _mm_madd_epi16(v[1], f[1]);
- u[2] = _mm_madd_epi16(v[2], f[2]);
- u[3] = _mm_madd_epi16(v[3], f[3]);
- u[4] = _mm_madd_epi16(v[4], f[4]);
- u[5] = _mm_madd_epi16(v[5], f[5]);
-
- u[6] = _mm_min_epi32(u[2], u[3]);
- u[7] = _mm_max_epi32(u[2], u[3]);
-
- u[0] = _mm_add_epi32(u[0], u[1]);
- u[0] = _mm_add_epi32(u[0], u[5]);
- u[0] = _mm_add_epi32(u[0], u[4]);
- u[0] = _mm_add_epi32(u[0], u[6]);
- u[0] = _mm_add_epi32(u[0], u[7]);
-
- _mm_storeu_si128((__m128i *)buf, u[0]);
-}
-
-void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w,
- int h,
- const InterpFilterParams filter_params,
- const int subpel_x_q4, int x_step_q4,
- int avg, int bd) {
- DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
- __m128i verf[6];
- HbdSubpelFilterCoeffs vCoeffs;
- const uint16_t *srcPtr;
- const int tapsNum = filter_params.taps;
- int i, col, count, blkResidu, blkHeight;
- TransposeSave transSave = transSaveTab[avg];
- (void)x_step_q4;
-
- if (0 == subpel_x_q4 || 16 != x_step_q4) {
- av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
- filter_params, subpel_x_q4, x_step_q4, avg, bd);
- return;
- }
-
- vCoeffs =
- hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
- if (!vCoeffs) {
- av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
- filter_params, subpel_x_q4, x_step_q4, avg, bd);
- return;
- }
-
- verf[0] = *((const __m128i *)(vCoeffs));
- verf[1] = *((const __m128i *)(vCoeffs + 1));
- verf[2] = *((const __m128i *)(vCoeffs + 2));
- verf[3] = *((const __m128i *)(vCoeffs + 3));
- verf[4] = *((const __m128i *)(vCoeffs + 4));
- verf[5] = *((const __m128i *)(vCoeffs + 5));
-
- src -= (tapsNum >> 1) - 1;
- srcPtr = src;
-
- count = 0;
- blkHeight = h >> 2;
- blkResidu = h & 3;
-
- while (blkHeight != 0) {
- for (col = 0; col < w; col += 4) {
- for (i = 0; i < 4; ++i) {
- highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
- srcPtr += 1;
- }
- transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
- }
- count++;
- srcPtr = src + count * src_stride * 4;
- dst += dst_stride * 4;
- blkHeight--;
- }
-
- if (blkResidu == 0) return;
-
- for (col = 0; col < w; col += 4) {
- for (i = 0; i < 4; ++i) {
- highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
- srcPtr += 1;
- }
- transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
- }
-}
-
// Vertical convolutional filter
typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
@@ -402,134 +203,3 @@ static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
}
WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
-
-static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
- const __m128i *f, int taps,
- uint16_t *dst, WritePixels saveFunc,
- int bd) {
- __m128i s[12];
- __m128i zero = _mm_setzero_si128();
- int i = 0;
- int r = 0;
-
- // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
- assert(taps == 10 || taps == 12);
- if (10 == taps) {
- i += 1;
- s[0] = zero;
- }
- while (i < 12) {
- s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
- i += 1;
- r += 1;
- }
-
- s[0] = _mm_unpacklo_epi16(s[0], s[1]);
- s[2] = _mm_unpacklo_epi16(s[2], s[3]);
- s[4] = _mm_unpacklo_epi16(s[4], s[5]);
- s[6] = _mm_unpacklo_epi16(s[6], s[7]);
- s[8] = _mm_unpacklo_epi16(s[8], s[9]);
- s[10] = _mm_unpacklo_epi16(s[10], s[11]);
-
- s[0] = _mm_madd_epi16(s[0], f[0]);
- s[2] = _mm_madd_epi16(s[2], f[1]);
- s[4] = _mm_madd_epi16(s[4], f[2]);
- s[6] = _mm_madd_epi16(s[6], f[3]);
- s[8] = _mm_madd_epi16(s[8], f[4]);
- s[10] = _mm_madd_epi16(s[10], f[5]);
-
- s[1] = _mm_min_epi32(s[4], s[6]);
- s[3] = _mm_max_epi32(s[4], s[6]);
-
- s[0] = _mm_add_epi32(s[0], s[2]);
- s[0] = _mm_add_epi32(s[0], s[10]);
- s[0] = _mm_add_epi32(s[0], s[8]);
- s[0] = _mm_add_epi32(s[0], s[1]);
- s[0] = _mm_add_epi32(s[0], s[3]);
-
- saveFunc(s, bd, dst);
-}
-
-static void highbd_filter_vert_compute_large(const uint16_t *src,
- int src_stride, const __m128i *f,
- int taps, int w, int h,
- uint16_t *dst, int dst_stride,
- int avg, int bd) {
- int col;
- int rowIndex = 0;
- const uint16_t *src_ptr = src;
- uint16_t *dst_ptr = dst;
- const int step = 4;
- WritePixels write4pixels = write4pixelsTab[avg];
-
- do {
- for (col = 0; col < w; col += step) {
- filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr,
- write4pixels, bd);
- src_ptr += step;
- dst_ptr += step;
- }
- rowIndex++;
- src_ptr = src + rowIndex * src_stride;
- dst_ptr = dst + rowIndex * dst_stride;
- } while (rowIndex < h);
-}
-
-static void highbd_filter_vert_compute_small(const uint16_t *src,
- int src_stride, const __m128i *f,
- int taps, int w, int h,
- uint16_t *dst, int dst_stride,
- int avg, int bd) {
- int rowIndex = 0;
- WritePixels write2pixels = write2pixelsTab[avg];
- (void)w;
-
- do {
- filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd);
- rowIndex++;
- src += src_stride;
- dst += dst_stride;
- } while (rowIndex < h);
-}
-
-void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w,
- int h,
- const InterpFilterParams filter_params,
- const int subpel_y_q4, int y_step_q4,
- int avg, int bd) {
- __m128i verf[6];
- HbdSubpelFilterCoeffs vCoeffs;
- const int tapsNum = filter_params.taps;
-
- if (0 == subpel_y_q4 || 16 != y_step_q4) {
- av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
- filter_params, subpel_y_q4, y_step_q4, avg, bd);
- return;
- }
-
- vCoeffs =
- hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
- if (!vCoeffs) {
- av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
- filter_params, subpel_y_q4, y_step_q4, avg, bd);
- return;
- }
-
- verf[0] = *((const __m128i *)(vCoeffs));
- verf[1] = *((const __m128i *)(vCoeffs + 1));
- verf[2] = *((const __m128i *)(vCoeffs + 2));
- verf[3] = *((const __m128i *)(vCoeffs + 3));
- verf[4] = *((const __m128i *)(vCoeffs + 4));
- verf[5] = *((const __m128i *)(vCoeffs + 5));
-
- src -= src_stride * ((tapsNum >> 1) - 1);
-
- if (w > 2) {
- highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst,
- dst_stride, avg, bd);
- } else {
- highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst,
- dst_stride, avg, bd);
- }
-}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
new file mode 100644
index 000000000..7415c58df
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -0,0 +1,1957 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_inv_txfm_avx2.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+
+static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(x1[0], x1[3]);
+ btf_16_adds_subs_avx2(x1[1], x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x1[5], x1[6]);
+
+ btf_16_adds_subs_avx2(x1[8], x1[11]);
+ btf_16_adds_subs_avx2(x1[9], x1[10]);
+ btf_16_subs_adds_avx2(x1[15], x1[12]);
+ btf_16_subs_adds_avx2(x1[14], x1[13]);
+}
+
+static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(x[0], x[7]);
+ btf_16_adds_subs_avx2(x[1], x[6]);
+ btf_16_adds_subs_avx2(x[2], x[5]);
+ btf_16_adds_subs_avx2(x[3], x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+}
+
+static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
+ btf_16_adds_subs_out_avx2(output[0], output[15], x1[0], x1[15]);
+ btf_16_adds_subs_out_avx2(output[1], output[14], x1[1], x1[14]);
+ btf_16_adds_subs_out_avx2(output[2], output[13], x1[2], x1[13]);
+ btf_16_adds_subs_out_avx2(output[3], output[12], x1[3], x1[12]);
+ btf_16_adds_subs_out_avx2(output[4], output[11], x1[4], x1[11]);
+ btf_16_adds_subs_out_avx2(output[5], output[10], x1[5], x1[10]);
+ btf_16_adds_subs_out_avx2(output[6], output[9], x1[6], x1[9]);
+ btf_16_adds_subs_out_avx2(output[7], output[8], x1[7], x1[8]);
+}
+
+static void idct16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = input[8];
+ x1[2] = input[4];
+ x1[3] = input[12];
+ x1[4] = input[2];
+ x1[5] = input[10];
+ x1[6] = input[6];
+ x1[7] = input[14];
+ x1[8] = input[1];
+ x1[9] = input[9];
+ x1[10] = input[5];
+ x1[11] = input[13];
+ x1[12] = input[3];
+ x1[13] = input[11];
+ x1[14] = input[7];
+ x1[15] = input[15];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+
+ // stage 3
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
+ btf_16_adds_subs_avx2(x1[8], x1[9]);
+ btf_16_subs_adds_avx2(x1[11], x1[10]);
+ btf_16_adds_subs_avx2(x1[12], x1[13]);
+ btf_16_subs_adds_avx2(x1[15], x1[14]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
+ btf_16_adds_subs_avx2(x1[4], x1[5]);
+ btf_16_subs_adds_avx2(x1[7], x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
+
+ idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
+ idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[2] = input[4];
+ x1[4] = input[2];
+ x1[6] = input[6];
+ x1[8] = input[1];
+ x1[10] = input[5];
+ x1[12] = input[3];
+ x1[14] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
+ btf_16_adds_subs_avx2(x1[8], x1[9]);
+ btf_16_subs_adds_avx2(x1[11], x1[10]);
+ btf_16_adds_subs_avx2(x1[12], x1[13]);
+ btf_16_subs_adds_avx2(x1[15], x1[14]);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
+ btf_16_adds_subs_avx2(x1[4], x1[5]);
+ btf_16_subs_adds_avx2(x1[7], x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x1[9], x1[14], x1[9], x1[14]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x1[10], x1[13], x1[10], x1[13]);
+
+ idct16_stage5_avx2(x1, cospi, __rounding, cos_bit);
+ idct16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x1[2];
+ x1[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+
+ // stage 5
+ // stage 6
+ output[0] = x1[0];
+ output[1] = x1[1];
+ output[2] = x1[1];
+ output[3] = x1[0];
+ output[4] = x1[0];
+ output[5] = x1[1];
+ output[6] = x1[1];
+ output[7] = x1[0];
+ output[8] = x1[0];
+ output[9] = x1[1];
+ output[10] = x1[1];
+ output[11] = x1[0];
+ output[12] = x1[0];
+ output[13] = x1[1];
+ output[14] = x1[1];
+ output[15] = x1[0];
+}
+
+static INLINE void iadst16_stage3_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(x[0], x[8]);
+ btf_16_adds_subs_avx2(x[1], x[9]);
+ btf_16_adds_subs_avx2(x[2], x[10]);
+ btf_16_adds_subs_avx2(x[3], x[11]);
+ btf_16_adds_subs_avx2(x[4], x[12]);
+ btf_16_adds_subs_avx2(x[5], x[13]);
+ btf_16_adds_subs_avx2(x[6], x[14]);
+ btf_16_adds_subs_avx2(x[7], x[15]);
+}
+
+static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage5_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(x[0], x[4]);
+ btf_16_adds_subs_avx2(x[1], x[5]);
+ btf_16_adds_subs_avx2(x[2], x[6]);
+ btf_16_adds_subs_avx2(x[3], x[7]);
+ btf_16_adds_subs_avx2(x[8], x[12]);
+ btf_16_adds_subs_avx2(x[9], x[13]);
+ btf_16_adds_subs_avx2(x[10], x[14]);
+ btf_16_adds_subs_avx2(x[11], x[15]);
+}
+
+static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage7_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(x[0], x[2]);
+ btf_16_adds_subs_avx2(x[1], x[3]);
+ btf_16_adds_subs_avx2(x[4], x[6]);
+ btf_16_adds_subs_avx2(x[5], x[7]);
+ btf_16_adds_subs_avx2(x[8], x[10]);
+ btf_16_adds_subs_avx2(x[9], x[11]);
+ btf_16_adds_subs_avx2(x[12], x[14]);
+ btf_16_adds_subs_avx2(x[13], x[15]);
+}
+
+static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x1[2], x1[3]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x1[6], x1[7]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x1[10], x1[11]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x1[14], x1[15]);
+}
+
+static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
+ const __m256i __zero = _mm256_setzero_si256();
+ output[0] = x1[0];
+ output[1] = _mm256_subs_epi16(__zero, x1[8]);
+ output[2] = x1[12];
+ output[3] = _mm256_subs_epi16(__zero, x1[4]);
+ output[4] = x1[6];
+ output[5] = _mm256_subs_epi16(__zero, x1[14]);
+ output[6] = x1[10];
+ output[7] = _mm256_subs_epi16(__zero, x1[2]);
+ output[8] = x1[3];
+ output[9] = _mm256_subs_epi16(__zero, x1[11]);
+ output[10] = x1[15];
+ output[11] = _mm256_subs_epi16(__zero, x1[7]);
+ output[12] = x1[5];
+ output[13] = _mm256_subs_epi16(__zero, x1[13]);
+ output[14] = x1[9];
+ output[15] = _mm256_subs_epi16(__zero, x1[1]);
+}
+
+static void iadst16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[15];
+ x1[1] = input[0];
+ x1[2] = input[13];
+ x1[3] = input[2];
+ x1[4] = input[11];
+ x1[5] = input[4];
+ x1[6] = input[9];
+ x1[7] = input[6];
+ x1[8] = input[7];
+ x1[9] = input[8];
+ x1[10] = input[5];
+ x1[11] = input[10];
+ x1[12] = input[3];
+ x1[13] = input[12];
+ x1[14] = input[1];
+ x1[15] = input[14];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, x1[0], x1[1], x1[0], x1[1]);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, x1[2], x1[3], x1[2], x1[3]);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, x1[4], x1[5], x1[4], x1[5]);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, x1[6], x1[7], x1[6], x1[7]);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, x1[8], x1[9], x1[8], x1[9]);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, x1[10], x1[11], x1[10], x1[11]);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, x1[12], x1[13], x1[12], x1[13]);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, x1[14], x1[15], x1[14], x1[15]);
+
+ iadst16_stage3_avx2(x1);
+ iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage5_avx2(x1);
+ iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage7_avx2(x1);
+ iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x1[16];
+ x1[1] = input[0];
+ x1[3] = input[2];
+ x1[5] = input[4];
+ x1[7] = input[6];
+ x1[8] = input[7];
+ x1[10] = input[5];
+ x1[12] = input[3];
+ x1[14] = input[1];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+ btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
+ btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
+ btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
+ btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
+ btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
+ btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
+ btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
+
+ iadst16_stage3_avx2(x1);
+ iadst16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage5_avx2(x1);
+ iadst16_stage6_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage7_avx2(x1);
+ iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[1] = input[0];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+
+ // stage 3
+ x1[8] = x1[0];
+ x1[9] = x1[1];
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, x1[8], x1[9], x1[8], x1[9]);
+
+ // stage 5
+ x1[4] = x1[0];
+ x1[5] = x1[1];
+
+ x1[12] = x1[8];
+ x1[13] = x1[9];
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[4], x1[5], x1[4], x1[5]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, x1[12], x1[13], x1[12], x1[13]);
+
+ // stage 7
+ x1[2] = x1[0];
+ x1[3] = x1[1];
+ x1[6] = x1[4];
+ x1[7] = x1[5];
+ x1[10] = x1[8];
+ x1[11] = x1[9];
+ x1[14] = x1[12];
+ x1[15] = x1[13];
+
+ iadst16_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(x[16], x[17]);
+ btf_16_subs_adds_avx2(x[19], x[18]);
+ btf_16_adds_subs_avx2(x[20], x[21]);
+ btf_16_subs_adds_avx2(x[23], x[22]);
+ btf_16_adds_subs_avx2(x[24], x[25]);
+ btf_16_subs_adds_avx2(x[27], x[26]);
+ btf_16_adds_subs_avx2(x[28], x[29]);
+ btf_16_subs_adds_avx2(x[31], x[30]);
+}
+
+static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+}
+
+static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ btf_16_adds_subs_avx2(x[16], x[19]);
+ btf_16_adds_subs_avx2(x[17], x[18]);
+ btf_16_subs_adds_avx2(x[23], x[20]);
+ btf_16_subs_adds_avx2(x[22], x[21]);
+ btf_16_adds_subs_avx2(x[24], x[27]);
+ btf_16_adds_subs_avx2(x[25], x[26]);
+ btf_16_subs_adds_avx2(x[31], x[28]);
+ btf_16_subs_adds_avx2(x[30], x[29]);
+}
+
+static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(x[8], x[11]);
+ btf_16_adds_subs_avx2(x[9], x[10]);
+ btf_16_subs_adds_avx2(x[15], x[12]);
+ btf_16_subs_adds_avx2(x[14], x[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(x[0], x[7]);
+ btf_16_adds_subs_avx2(x[1], x[6]);
+ btf_16_adds_subs_avx2(x[2], x[5]);
+ btf_16_adds_subs_avx2(x[3], x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(x[16], x[23]);
+ btf_16_adds_subs_avx2(x[17], x[22]);
+ btf_16_adds_subs_avx2(x[18], x[21]);
+ btf_16_adds_subs_avx2(x[19], x[20]);
+ btf_16_subs_adds_avx2(x[31], x[24]);
+ btf_16_subs_adds_avx2(x[30], x[25]);
+ btf_16_subs_adds_avx2(x[29], x[26]);
+ btf_16_subs_adds_avx2(x[28], x[27]);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(x[0], x[15]);
+ btf_16_adds_subs_avx2(x[1], x[14]);
+ btf_16_adds_subs_avx2(x[2], x[13]);
+ btf_16_adds_subs_avx2(x[3], x[12]);
+ btf_16_adds_subs_avx2(x[4], x[11]);
+ btf_16_adds_subs_avx2(x[5], x[10]);
+ btf_16_adds_subs_avx2(x[6], x[9]);
+ btf_16_adds_subs_avx2(x[7], x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
+ btf_16_adds_subs_out_avx2(output[0], output[31], x[0], x[31]);
+ btf_16_adds_subs_out_avx2(output[1], output[30], x[1], x[30]);
+ btf_16_adds_subs_out_avx2(output[2], output[29], x[2], x[29]);
+ btf_16_adds_subs_out_avx2(output[3], output[28], x[3], x[28]);
+ btf_16_adds_subs_out_avx2(output[4], output[27], x[4], x[27]);
+ btf_16_adds_subs_out_avx2(output[5], output[26], x[5], x[26]);
+ btf_16_adds_subs_out_avx2(output[6], output[25], x[6], x[25]);
+ btf_16_adds_subs_out_avx2(output[7], output[24], x[7], x[24]);
+ btf_16_adds_subs_out_avx2(output[8], output[23], x[8], x[23]);
+ btf_16_adds_subs_out_avx2(output[9], output[22], x[9], x[22]);
+ btf_16_adds_subs_out_avx2(output[10], output[21], x[10], x[21]);
+ btf_16_adds_subs_out_avx2(output[11], output[20], x[11], x[20]);
+ btf_16_adds_subs_out_avx2(output[12], output[19], x[12], x[19]);
+ btf_16_adds_subs_out_avx2(output[13], output[18], x[13], x[18]);
+ btf_16_adds_subs_out_avx2(output[14], output[17], x[14], x[17]);
+ btf_16_adds_subs_out_avx2(output[15], output[16], x[15], x[16]);
+}
+
+static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ output[0] = x[0];
+ output[31] = x[0];
+ output[1] = x[1];
+ output[30] = x[1];
+ output[2] = x[1];
+ output[29] = x[1];
+ output[3] = x[0];
+ output[28] = x[0];
+ output[4] = x[0];
+ output[27] = x[0];
+ output[5] = x[1];
+ output[26] = x[1];
+ output[6] = x[1];
+ output[25] = x[1];
+ output[7] = x[0];
+ output[24] = x[0];
+ output[8] = x[0];
+ output[23] = x[0];
+ output[9] = x[1];
+ output[22] = x[1];
+ output[10] = x[1];
+ output[21] = x[1];
+ output[11] = x[0];
+ output[20] = x[0];
+ output[12] = x[0];
+ output[19] = x[0];
+ output[13] = x[1];
+ output[18] = x[1];
+ output[14] = x[1];
+ output[17] = x[1];
+ output[15] = x[0];
+ output[16] = x[0];
+}
+
+static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+ // stage 6
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_avx2(output, x);
+}
+
+static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_avx2(x);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(x[8], x[9]);
+ btf_16_subs_adds_avx2(x[11], x[10]);
+ btf_16_adds_subs_avx2(x[12], x[13]);
+ btf_16_subs_adds_avx2(x[15], x[14]);
+ idct32_high16_stage4_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(x[4], x[5]);
+ btf_16_subs_adds_avx2(x[7], x[6]);
+ idct32_high24_stage5_avx2(x, cospi, __rounding, cos_bit);
+
+ btf_16_adds_subs_avx2(x[0], x[3]);
+ btf_16_adds_subs_avx2(x[1], x[2]);
+ idct32_high28_stage6_avx2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_avx2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_avx2(output, x);
+}
+
+static void idct32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m256i x1[32];
+ x1[0] = input[0];
+ x1[1] = input[16];
+ x1[2] = input[8];
+ x1[3] = input[24];
+ x1[4] = input[4];
+ x1[5] = input[20];
+ x1[6] = input[12];
+ x1[7] = input[28];
+ x1[8] = input[2];
+ x1[9] = input[18];
+ x1[10] = input[10];
+ x1[11] = input[26];
+ x1[12] = input[6];
+ x1[13] = input[22];
+ x1[14] = input[14];
+ x1[15] = input[30];
+ x1[16] = input[1];
+ x1[17] = input[17];
+ x1[18] = input[9];
+ x1[19] = input[25];
+ x1[20] = input[5];
+ x1[21] = input[21];
+ x1[22] = input[13];
+ x1[23] = input[29];
+ x1[24] = input[3];
+ x1[25] = input[19];
+ x1[26] = input[11];
+ x1[27] = input[27];
+ x1[28] = input[7];
+ x1[29] = input[23];
+ x1[30] = input[15];
+ x1[31] = input[31];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, x1[16], x1[31], x1[16], x1[31]);
+ btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, x1[17], x1[30], x1[17], x1[30]);
+ btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, x1[18], x1[29], x1[18], x1[29]);
+ btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, x1[19], x1[28], x1[19], x1[28]);
+ btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, x1[20], x1[27], x1[20], x1[27]);
+ btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, x1[21], x1[26], x1[21], x1[26]);
+ btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, x1[22], x1[25], x1[22], x1[25]);
+ btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, x1[23], x1[24], x1[23], x1[24]);
+
+ // stage 3
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, x1[8], x1[15], x1[8], x1[15]);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, x1[9], x1[14], x1[9], x1[14]);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, x1[10], x1[13], x1[10], x1[13]);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, x1[11], x1[12], x1[11], x1[12]);
+ idct32_high16_stage3_avx2(x1);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, x1[4], x1[7], x1[4], x1[7]);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, x1[5], x1[6], x1[5], x1[6]);
+ btf_16_adds_subs_avx2(x1[8], x1[9]);
+ btf_16_subs_adds_avx2(x1[11], x1[10]);
+ btf_16_adds_subs_avx2(x1[12], x1[13]);
+ btf_16_subs_adds_avx2(x1[15], x1[14]);
+ idct32_high16_stage4_avx2(x1, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x1[0], x1[1]);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, x1[2], x1[3], x1[2], x1[3]);
+ btf_16_adds_subs_avx2(x1[4], x1[5]);
+ btf_16_subs_adds_avx2(x1[7], x1[6]);
+ idct32_high24_stage5_avx2(x1, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_adds_subs_avx2(x1[0], x1[3]);
+ btf_16_adds_subs_avx2(x1[1], x1[2]);
+ idct32_high28_stage6_avx2(x1, cospi, __rounding, cos_bit);
+
+ idct32_stage7_avx2(x1, cospi, __rounding, cos_bit);
+ idct32_stage8_avx2(x1, cospi, __rounding, cos_bit);
+ idct32_stage9_avx2(output, x1);
+}
+
+static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+}
+
+static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_adds_subs_avx2(x[32], x[35]);
+ btf_16_adds_subs_avx2(x[33], x[34]);
+ btf_16_subs_adds_avx2(x[39], x[36]);
+ btf_16_subs_adds_avx2(x[38], x[37]);
+ btf_16_adds_subs_avx2(x[40], x[43]);
+ btf_16_adds_subs_avx2(x[41], x[42]);
+ btf_16_subs_adds_avx2(x[47], x[44]);
+ btf_16_subs_adds_avx2(x[46], x[45]);
+ btf_16_adds_subs_avx2(x[48], x[51]);
+ btf_16_adds_subs_avx2(x[49], x[50]);
+ btf_16_subs_adds_avx2(x[55], x[52]);
+ btf_16_subs_adds_avx2(x[54], x[53]);
+ btf_16_adds_subs_avx2(x[56], x[59]);
+ btf_16_adds_subs_avx2(x[57], x[58]);
+ btf_16_subs_adds_avx2(x[63], x[60]);
+ btf_16_subs_adds_avx2(x[62], x[61]);
+}
+
+static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+}
+
+static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ btf_16_adds_subs_avx2(x[16], x[19]);
+ btf_16_adds_subs_avx2(x[17], x[18]);
+ btf_16_subs_adds_avx2(x[23], x[20]);
+ btf_16_subs_adds_avx2(x[22], x[21]);
+ btf_16_adds_subs_avx2(x[24], x[27]);
+ btf_16_adds_subs_avx2(x[25], x[26]);
+ btf_16_subs_adds_avx2(x[31], x[28]);
+ btf_16_subs_adds_avx2(x[30], x[29]);
+ idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+ btf_16_adds_subs_avx2(x[32], x[39]);
+ btf_16_adds_subs_avx2(x[33], x[38]);
+ btf_16_adds_subs_avx2(x[34], x[37]);
+ btf_16_adds_subs_avx2(x[35], x[36]);
+ btf_16_subs_adds_avx2(x[47], x[40]);
+ btf_16_subs_adds_avx2(x[46], x[41]);
+ btf_16_subs_adds_avx2(x[45], x[42]);
+ btf_16_subs_adds_avx2(x[44], x[43]);
+ btf_16_adds_subs_avx2(x[48], x[55]);
+ btf_16_adds_subs_avx2(x[49], x[54]);
+ btf_16_adds_subs_avx2(x[50], x[53]);
+ btf_16_adds_subs_avx2(x[51], x[52]);
+ btf_16_subs_adds_avx2(x[63], x[56]);
+ btf_16_subs_adds_avx2(x[62], x[57]);
+ btf_16_subs_adds_avx2(x[61], x[58]);
+ btf_16_subs_adds_avx2(x[60], x[59]);
+}
+
+static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_adds_subs_avx2(x[16], x[23]);
+ btf_16_adds_subs_avx2(x[17], x[22]);
+ btf_16_adds_subs_avx2(x[18], x[21]);
+ btf_16_adds_subs_avx2(x[19], x[20]);
+ btf_16_subs_adds_avx2(x[31], x[24]);
+ btf_16_subs_adds_avx2(x[30], x[25]);
+ btf_16_subs_adds_avx2(x[29], x[26]);
+ btf_16_subs_adds_avx2(x[28], x[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+}
+
+static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(x[0], x[15]);
+ btf_16_adds_subs_avx2(x[1], x[14]);
+ btf_16_adds_subs_avx2(x[2], x[13]);
+ btf_16_adds_subs_avx2(x[3], x[12]);
+ btf_16_adds_subs_avx2(x[4], x[11]);
+ btf_16_adds_subs_avx2(x[5], x[10]);
+ btf_16_adds_subs_avx2(x[6], x[9]);
+ btf_16_adds_subs_avx2(x[7], x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(x[32], x[47]);
+ btf_16_adds_subs_avx2(x[33], x[46]);
+ btf_16_adds_subs_avx2(x[34], x[45]);
+ btf_16_adds_subs_avx2(x[35], x[44]);
+ btf_16_adds_subs_avx2(x[36], x[43]);
+ btf_16_adds_subs_avx2(x[37], x[42]);
+ btf_16_adds_subs_avx2(x[38], x[41]);
+ btf_16_adds_subs_avx2(x[39], x[40]);
+ btf_16_subs_adds_avx2(x[63], x[48]);
+ btf_16_subs_adds_avx2(x[62], x[49]);
+ btf_16_subs_adds_avx2(x[61], x[50]);
+ btf_16_subs_adds_avx2(x[60], x[51]);
+ btf_16_subs_adds_avx2(x[59], x[52]);
+ btf_16_subs_adds_avx2(x[58], x[53]);
+ btf_16_subs_adds_avx2(x[57], x[54]);
+ btf_16_subs_adds_avx2(x[56], x[55]);
+}
+
+static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i __rounding,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(x[0], x[31]);
+ btf_16_adds_subs_avx2(x[1], x[30]);
+ btf_16_adds_subs_avx2(x[2], x[29]);
+ btf_16_adds_subs_avx2(x[3], x[28]);
+ btf_16_adds_subs_avx2(x[4], x[27]);
+ btf_16_adds_subs_avx2(x[5], x[26]);
+ btf_16_adds_subs_avx2(x[6], x[25]);
+ btf_16_adds_subs_avx2(x[7], x[24]);
+ btf_16_adds_subs_avx2(x[8], x[23]);
+ btf_16_adds_subs_avx2(x[9], x[22]);
+ btf_16_adds_subs_avx2(x[10], x[21]);
+ btf_16_adds_subs_avx2(x[11], x[20]);
+ btf_16_adds_subs_avx2(x[12], x[19]);
+ btf_16_adds_subs_avx2(x[13], x[18]);
+ btf_16_adds_subs_avx2(x[14], x[17]);
+ btf_16_adds_subs_avx2(x[15], x[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+}
+
+static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
+ btf_16_adds_subs_out_avx2(output[0], output[63], x[0], x[63]);
+ btf_16_adds_subs_out_avx2(output[1], output[62], x[1], x[62]);
+ btf_16_adds_subs_out_avx2(output[2], output[61], x[2], x[61]);
+ btf_16_adds_subs_out_avx2(output[3], output[60], x[3], x[60]);
+ btf_16_adds_subs_out_avx2(output[4], output[59], x[4], x[59]);
+ btf_16_adds_subs_out_avx2(output[5], output[58], x[5], x[58]);
+ btf_16_adds_subs_out_avx2(output[6], output[57], x[6], x[57]);
+ btf_16_adds_subs_out_avx2(output[7], output[56], x[7], x[56]);
+ btf_16_adds_subs_out_avx2(output[8], output[55], x[8], x[55]);
+ btf_16_adds_subs_out_avx2(output[9], output[54], x[9], x[54]);
+ btf_16_adds_subs_out_avx2(output[10], output[53], x[10], x[53]);
+ btf_16_adds_subs_out_avx2(output[11], output[52], x[11], x[52]);
+ btf_16_adds_subs_out_avx2(output[12], output[51], x[12], x[51]);
+ btf_16_adds_subs_out_avx2(output[13], output[50], x[13], x[50]);
+ btf_16_adds_subs_out_avx2(output[14], output[49], x[14], x[49]);
+ btf_16_adds_subs_out_avx2(output[15], output[48], x[15], x[48]);
+ btf_16_adds_subs_out_avx2(output[16], output[47], x[16], x[47]);
+ btf_16_adds_subs_out_avx2(output[17], output[46], x[17], x[46]);
+ btf_16_adds_subs_out_avx2(output[18], output[45], x[18], x[45]);
+ btf_16_adds_subs_out_avx2(output[19], output[44], x[19], x[44]);
+ btf_16_adds_subs_out_avx2(output[20], output[43], x[20], x[43]);
+ btf_16_adds_subs_out_avx2(output[21], output[42], x[21], x[42]);
+ btf_16_adds_subs_out_avx2(output[22], output[41], x[22], x[41]);
+ btf_16_adds_subs_out_avx2(output[23], output[40], x[23], x[40]);
+ btf_16_adds_subs_out_avx2(output[24], output[39], x[24], x[39]);
+ btf_16_adds_subs_out_avx2(output[25], output[38], x[25], x[38]);
+ btf_16_adds_subs_out_avx2(output[26], output[37], x[26], x[37]);
+ btf_16_adds_subs_out_avx2(output[27], output[36], x[27], x[36]);
+ btf_16_adds_subs_out_avx2(output[28], output[35], x[28], x[35]);
+ btf_16_adds_subs_out_avx2(output[29], output[34], x[29], x[34]);
+ btf_16_adds_subs_out_avx2(output[30], output[33], x[30], x[33]);
+ btf_16_adds_subs_out_avx2(output[31], output[32], x[31], x[32]);
+}
+
+static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 7
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ output[0] = x[0];
+ output[63] = x[0];
+ output[1] = x[1];
+ output[62] = x[1];
+ output[2] = x[1];
+ output[61] = x[1];
+ output[3] = x[0];
+ output[60] = x[0];
+ output[4] = x[0];
+ output[59] = x[0];
+ output[5] = x[1];
+ output[58] = x[1];
+ output[6] = x[1];
+ output[57] = x[1];
+ output[7] = x[0];
+ output[56] = x[0];
+ output[8] = x[0];
+ output[55] = x[0];
+ output[9] = x[1];
+ output[54] = x[1];
+ output[10] = x[1];
+ output[53] = x[1];
+ output[11] = x[0];
+ output[52] = x[0];
+ output[12] = x[0];
+ output[51] = x[0];
+ output[13] = x[1];
+ output[50] = x[1];
+ output[14] = x[1];
+ output[49] = x[1];
+ output[15] = x[0];
+ output[48] = x[0];
+ output[16] = x[0];
+ output[47] = x[0];
+ output[17] = x[1];
+ output[46] = x[1];
+ output[18] = x[1];
+ output[45] = x[1];
+ output[19] = x[0];
+ output[44] = x[0];
+ output[20] = x[0];
+ output[43] = x[0];
+ output[21] = x[1];
+ output[42] = x[1];
+ output[22] = x[1];
+ output[41] = x[1];
+ output[23] = x[0];
+ output[40] = x[0];
+ output[24] = x[0];
+ output[39] = x[0];
+ output[25] = x[1];
+ output[38] = x[1];
+ output[26] = x[1];
+ output[37] = x[1];
+ output[27] = x[0];
+ output[36] = x[0];
+ output[28] = x[0];
+ output[35] = x[0];
+ output[29] = x[1];
+ output[34] = x[1];
+ output[30] = x[1];
+ output[33] = x[1];
+ output[31] = x[0];
+ output[32] = x[0];
+}
+
+static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+ // stage 5
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ x[9] = x[9];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(x[8], x[11]);
+ btf_16_adds_subs_avx2(x[9], x[10]);
+ btf_16_subs_adds_avx2(x[15], x[12]);
+ btf_16_subs_adds_avx2(x[14], x[13]);
+ idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_avx2(x[0], x[7]);
+ btf_16_adds_subs_avx2(x[1], x[6]);
+ btf_16_adds_subs_avx2(x[2], x[5]);
+ btf_16_adds_subs_avx2(x[3], x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(x[32], x[33]);
+ btf_16_subs_adds_avx2(x[35], x[34]);
+ btf_16_adds_subs_avx2(x[36], x[37]);
+ btf_16_subs_adds_avx2(x[39], x[38]);
+ btf_16_adds_subs_avx2(x[40], x[41]);
+ btf_16_subs_adds_avx2(x[43], x[42]);
+ btf_16_adds_subs_avx2(x[44], x[45]);
+ btf_16_subs_adds_avx2(x[47], x[46]);
+ btf_16_adds_subs_avx2(x[48], x[49]);
+ btf_16_subs_adds_avx2(x[51], x[50]);
+ btf_16_adds_subs_avx2(x[52], x[53]);
+ btf_16_subs_adds_avx2(x[55], x[54]);
+ btf_16_adds_subs_avx2(x[56], x[57]);
+ btf_16_subs_adds_avx2(x[59], x[58]);
+ btf_16_adds_subs_avx2(x[60], x[61]);
+ btf_16_subs_adds_avx2(x[63], x[62]);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(x[16], x[17]);
+ btf_16_subs_adds_avx2(x[19], x[18]);
+ btf_16_adds_subs_avx2(x[20], x[21]);
+ btf_16_subs_adds_avx2(x[23], x[22]);
+ btf_16_adds_subs_avx2(x[24], x[25]);
+ btf_16_subs_adds_avx2(x[27], x[26]);
+ btf_16_adds_subs_avx2(x[28], x[29]);
+ btf_16_subs_adds_avx2(x[31], x[30]);
+ idct64_stage4_high32_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(x[8], x[9]);
+ btf_16_subs_adds_avx2(x[11], x[10]);
+ btf_16_adds_subs_avx2(x[12], x[13]);
+ btf_16_subs_adds_avx2(x[15], x[14]);
+ idct64_stage5_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(x[4], x[5]);
+ btf_16_subs_adds_avx2(x[7], x[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_avx2(x[0], x[3]);
+ btf_16_adds_subs_avx2(x[1], x[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(x[8], x[11]);
+ btf_16_adds_subs_avx2(x[9], x[10]);
+ btf_16_subs_adds_avx2(x[15], x[12]);
+ btf_16_subs_adds_avx2(x[14], x[13]);
+ idct64_stage7_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_avx2(x[0], x[7]);
+ btf_16_adds_subs_avx2(x[1], x[6]);
+ btf_16_adds_subs_avx2(x[2], x[5]);
+ btf_16_adds_subs_avx2(x[3], x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_avx2(x, cospi, __rounding, cos_bit);
+
+ // stage 9~11
+ idct64_stage9_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_avx2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_avx2(output, x);
+}
+
+// 1D functions process 16 pixels at one time.
+static const transform_1d_avx2
+ lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL },
+ { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2,
+ idct32_new_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2,
+ idct64_low32_new_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// only process w >= 16 h >= 16
+static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m256i buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div16 = txfm_size_col >> 4;
+ const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
+ const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
+ __m256i buf0[64];
+ const int32_t *input_row = input + (i << 4) * input_stride;
+ for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
+ __m256i *buf0_cur = buf0 + j * 16;
+ const int32_t *input_cur = input_row + j * 16;
+ load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
+ 16);
+ transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_avx2(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+
+ __m256i *buf1_cur = buf1 + (i << 4);
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ __m256i temp[16];
+ flip_buf_av2(buf0 + 16 * j, temp, 16);
+ int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
+ transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ __m256i *buf1_cur = buf1 + i * txfm_size_row;
+ col_txfm(buf1_cur, buf1_cur, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]);
+ }
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
+ stride, ud_flip, txfm_size_row);
+ }
+}
+
+static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
+ int stride, int shift, int height,
+ int txw_idx, int rect_type) {
+ const int32_t *input_row = input;
+ const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
+ const __m256i rounding = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i scale_rounding = _mm256_unpacklo_epi16(scale, rounding);
+ if (rect_type != 1 && rect_type != -1) {
+ for (int i = 0; i < height; ++i) {
+ const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale_rounding);
+ hi = _mm256_madd_epi16(hi, scale_rounding);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ } else {
+ const __m256i rect_scale =
+ _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+ for (int i = 0; i < height; ++i) {
+ __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ src = _mm256_mulhrs_epi16(src, rect_scale);
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale_rounding);
+ hi = _mm256_madd_epi16(hi, scale_rounding);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ }
+}
+
+static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
+ __m256i *buf, int shift, int height,
+ int txh_idx) {
+ const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
+ const __m256i scale_rounding = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m256i shift_rounding = _mm256_set1_epi32(1 << (-shift - 1));
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale_rounding);
+ for (int h = 0; h < height; ++h) {
+ __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
+ __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
+ lo = _mm256_madd_epi16(lo, scale_coeff);
+ hi = _mm256_madd_epi16(hi, scale_coeff);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
+ lo = _mm256_add_epi32(lo, shift_rounding);
+ hi = _mm256_add_epi32(hi, shift_rounding);
+ lo = _mm256_srai_epi32(lo, -shift);
+ hi = _mm256_srai_epi32(hi, -shift);
+ const __m256i x = _mm256_packs_epi32(lo, hi);
+ write_recon_w16_avx2(x, output);
+ output += stride;
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_SIZE tx_size,
+ int32_t eob) {
+ (void)eob;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ __m256i buf[32];
+ for (int i = 0; i < input_stride; i += 16) {
+ iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
+ txw_idx, rect_type);
+ iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
+ txh_idx);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
+ const int input_stride = txfm_size_col_notzero;
+ const int buf_size_w_div16 = (eobx + 16) >> 4;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 col_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ __m256i buf0[64];
+ iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
+ eoby + 1, txw_idx, rect_type);
+ col_txfm(buf0, buf0, cos_bit_col);
+ __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
+ int k = ud_flip ? (txfm_size_row - 1) : 0;
+ const int step = ud_flip ? -1 : 1;
+ for (int j = 0; j < txfm_size_row; ++j, k += step) {
+ __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
+ write_recon_w16_avx2(res, output + (i << 4) + j * stride);
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m256i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div16 = txfm_size_col >> 4;
+ const int buf_size_h_div16 = (eoby + 16) >> 4;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_avx2 row_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+
+ assert(row_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div16; i++) {
+ __m256i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 16;
+ for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
+ __m256i *buf0_cur = buf0 + j * 16;
+ load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
+ buf0_cur, 16);
+ transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_avx2(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+ __m256i *_buf1 = buf1;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ __m256i temp[16];
+ flip_buf_av2(buf0 + 16 * j, temp, 16);
+ transpose_16bit_16x16_avx2(temp,
+ _buf1 + 16 * (buf_size_w_div16 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
+ }
+ }
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
+ buf1 + j * 16, shift[1], 16, txh_idx);
+ }
+ }
+}
+
+// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ case ADST_ADST: // ADST in both directions
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case IDTX:
+ lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ default:
+ av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ switch (tx_size) {
+ case TX_4X4:
+ case TX_8X8:
+ case TX_4X8:
+ case TX_8X4:
+ case TX_8X16:
+ case TX_16X8:
+ case TX_4X16:
+ case TX_16X4:
+ case TX_8X32:
+ case TX_32X8:
+ av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_16X16:
+ case TX_32X32:
+ case TX_64X64:
+ case TX_16X32:
+ case TX_32X16:
+ case TX_32X64:
+ case TX_64X32:
+ case TX_16X64:
+ case TX_64X16:
+ default:
+ lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (!txfm_param->lossless) {
+ av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
new file mode 100644
index 000000000..c17f655c5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define pair_set_w16_epi16(a, b) \
+ _mm256_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
+
+#define btf_16_w16_avx2(w0, w1, in0, in1, out0, out1) \
+ { \
+ __m256i t0 = _mm256_unpacklo_epi16(in0, in1); \
+ __m256i t1 = _mm256_unpackhi_epi16(in0, in1); \
+ __m256i u0 = _mm256_madd_epi16(t0, w0); \
+ __m256i u1 = _mm256_madd_epi16(t1, w0); \
+ __m256i v0 = _mm256_madd_epi16(t0, w1); \
+ __m256i v1 = _mm256_madd_epi16(t1, w1); \
+ \
+ __m256i a0 = _mm256_add_epi32(u0, __rounding); \
+ __m256i a1 = _mm256_add_epi32(u1, __rounding); \
+ __m256i b0 = _mm256_add_epi32(v0, __rounding); \
+ __m256i b1 = _mm256_add_epi32(v1, __rounding); \
+ \
+ __m256i c0 = _mm256_srai_epi32(a0, cos_bit); \
+ __m256i c1 = _mm256_srai_epi32(a1, cos_bit); \
+ __m256i d0 = _mm256_srai_epi32(b0, cos_bit); \
+ __m256i d1 = _mm256_srai_epi32(b1, cos_bit); \
+ \
+ out0 = _mm256_packs_epi32(c0, c1); \
+ out1 = _mm256_packs_epi32(d0, d1); \
+ }
+
+// half input is zero
+#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \
+ { \
+ const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
+ const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
+ const __m256i _in = in; \
+ out0 = _mm256_mulhrs_epi16(_in, _w0); \
+ out1 = _mm256_mulhrs_epi16(_in, _w1); \
+ }
+
+#define btf_16_adds_subs_avx2(in0, in1) \
+ { \
+ const __m256i _in0 = in0; \
+ const __m256i _in1 = in1; \
+ in0 = _mm256_adds_epi16(_in0, _in1); \
+ in1 = _mm256_subs_epi16(_in0, _in1); \
+ }
+
+#define btf_16_subs_adds_avx2(in0, in1) \
+ { \
+ const __m256i _in0 = in0; \
+ const __m256i _in1 = in1; \
+ in1 = _mm256_subs_epi16(_in0, _in1); \
+ in0 = _mm256_adds_epi16(_in0, _in1); \
+ }
+
+#define btf_16_adds_subs_out_avx2(out0, out1, in0, in1) \
+ { \
+ const __m256i _in0 = in0; \
+ const __m256i _in1 = in1; \
+ out0 = _mm256_adds_epi16(_in0, _in1); \
+ out1 = _mm256_subs_epi16(_in0, _in1); \
+ }
+
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+ const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+ const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+ return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f
+ // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f
+ // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f
+ // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f
+ // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f
+ // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f
+ // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f
+ // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f
+ // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ // ...
+ __m256i a[16];
+ for (int i = 0; i < 16; i += 2) {
+ a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
+ a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
+ }
+ __m256i b[16];
+ for (int i = 0; i < 16; i += 2) {
+ b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
+ b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
+ }
+ __m256i c[16];
+ for (int i = 0; i < 16; i += 2) {
+ c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
+ c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
+ }
+ out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
+ out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
+ out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
+ out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
+
+ out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
+ out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
+ out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
+ out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
+
+ out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
+ out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
+ out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
+ out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
+
+ out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
+ out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
+ out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
+ out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
+}
+
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+ if (bit < 0) {
+ __m256i scale = _mm256_set1_epi16(1 << (bit + 15));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_mulhrs_epi16(in[i], scale);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
+ int size) {
+ const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
+ for (int i = 0; i < size; ++i) {
+ output[i] = _mm256_mulhrs_epi16(input[i], scale);
+ }
+}
+
+static INLINE void flip_buf_av2(__m256i *in, __m256i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
+ __m128i pred = _mm_loadu_si128((__m128i const *)(output));
+ __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
+ __m128i y = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
+ _mm_storeu_si128((__m128i *)(output), y);
+}
+
+static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
+ int stride, int flipud,
+ int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ write_recon_w16_avx2(in[j], output + i * stride);
+ }
+}
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+ int8_t cos_bit);
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
new file mode 100644
index 000000000..dd7cee24c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -0,0 +1,2917 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+
+// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
+
+static void idct4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 4
+ // stage 5
+ output[0] = x[0];
+ output[7] = x[0];
+ output[1] = x[1];
+ output[6] = x[1];
+ output[2] = x[1];
+ output[5] = x[1];
+ output[3] = x[0];
+ output[4] = x[0];
+}
+
+void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+}
+
+static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+}
+
+static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
+ btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
+ btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
+ btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
+ btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
+ btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
+ btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
+ btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
+}
+
+static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ output[0] = x[0];
+ output[15] = x[0];
+ output[1] = x[1];
+ output[14] = x[1];
+ output[2] = x[1];
+ output[13] = x[1];
+ output[3] = x[0];
+ output[12] = x[0];
+ output[4] = x[0];
+ output[11] = x[0];
+ output[5] = x[1];
+ output[10] = x[1];
+ output[6] = x[1];
+ output[9] = x[1];
+ output[7] = x[0];
+ output[8] = x[0];
+}
+
+static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[2] = input[4];
+ x[4] = input[2];
+ x[6] = input[6];
+ x[8] = input[1];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5~7
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+
+ // stage 7
+ idct16_stage7_sse2(output, x);
+}
+
+static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+}
+
+static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+}
+
+static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ btf_16_adds_subs_sse2(x[16], x[19]);
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);
+ btf_16_subs_adds_sse2(x[30], x[29]);
+}
+
+static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+}
+
+static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[23]);
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+}
+
+static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+}
+
+static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
+ btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
+ btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
+ btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
+ btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
+ btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
+ btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
+ btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
+ btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
+ btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
+ btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
+ btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
+ btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
+ btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
+ btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
+ btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
+}
+
+static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ output[0] = x[0];
+ output[31] = x[0];
+ output[1] = x[1];
+ output[30] = x[1];
+ output[2] = x[1];
+ output[29] = x[1];
+ output[3] = x[0];
+ output[28] = x[0];
+ output[4] = x[0];
+ output[27] = x[0];
+ output[5] = x[1];
+ output[26] = x[1];
+ output[6] = x[1];
+ output[25] = x[1];
+ output[7] = x[0];
+ output[24] = x[0];
+ output[8] = x[0];
+ output[23] = x[0];
+ output[9] = x[1];
+ output[22] = x[1];
+ output[10] = x[1];
+ output[21] = x[1];
+ output[11] = x[0];
+ output[20] = x[0];
+ output[12] = x[0];
+ output[19] = x[0];
+ output[13] = x[1];
+ output[18] = x[1];
+ output[14] = x[1];
+ output[17] = x[1];
+ output[15] = x[0];
+ output[16] = x[0];
+}
+
+static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+ // stage 6
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static void idct32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[1] = input[16];
+ x[2] = input[8];
+ x[3] = input[24];
+ x[4] = input[4];
+ x[5] = input[20];
+ x[6] = input[12];
+ x[7] = input[28];
+ x[8] = input[2];
+ x[9] = input[18];
+ x[10] = input[10];
+ x[11] = input[26];
+ x[12] = input[6];
+ x[13] = input[22];
+ x[14] = input[14];
+ x[15] = input[30];
+ x[16] = input[1];
+ x[17] = input[17];
+ x[18] = input[9];
+ x[19] = input[25];
+ x[20] = input[5];
+ x[21] = input[21];
+ x[22] = input[13];
+ x[23] = input[29];
+ x[24] = input[3];
+ x[25] = input[19];
+ x[26] = input[11];
+ x[27] = input[27];
+ x[28] = input[7];
+ x[29] = input[23];
+ x[30] = input[15];
+ x[31] = input[31];
+
+ // stage 2
+ btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
+ btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_adds_subs_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7~8
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+}
+
+static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_adds_subs_sse2(x[32], x[35]);
+ btf_16_adds_subs_sse2(x[33], x[34]);
+ btf_16_subs_adds_sse2(x[39], x[36]);
+ btf_16_subs_adds_sse2(x[38], x[37]);
+ btf_16_adds_subs_sse2(x[40], x[43]);
+ btf_16_adds_subs_sse2(x[41], x[42]);
+ btf_16_subs_adds_sse2(x[47], x[44]);
+ btf_16_subs_adds_sse2(x[46], x[45]);
+ btf_16_adds_subs_sse2(x[48], x[51]);
+ btf_16_adds_subs_sse2(x[49], x[50]);
+ btf_16_subs_adds_sse2(x[55], x[52]);
+ btf_16_subs_adds_sse2(x[54], x[53]);
+ btf_16_adds_subs_sse2(x[56], x[59]);
+ btf_16_adds_subs_sse2(x[57], x[58]);
+ btf_16_subs_adds_sse2(x[63], x[60]);
+ btf_16_subs_adds_sse2(x[62], x[61]);
+}
+
+static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+}
+
+static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ btf_16_adds_subs_sse2(x[16], x[19]);
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);
+ btf_16_subs_adds_sse2(x[30], x[29]);
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+ btf_16_adds_subs_sse2(x[32], x[39]);
+ btf_16_adds_subs_sse2(x[33], x[38]);
+ btf_16_adds_subs_sse2(x[34], x[37]);
+ btf_16_adds_subs_sse2(x[35], x[36]);
+ btf_16_subs_adds_sse2(x[47], x[40]);
+ btf_16_subs_adds_sse2(x[46], x[41]);
+ btf_16_subs_adds_sse2(x[45], x[42]);
+ btf_16_subs_adds_sse2(x[44], x[43]);
+ btf_16_adds_subs_sse2(x[48], x[55]);
+ btf_16_adds_subs_sse2(x[49], x[54]);
+ btf_16_adds_subs_sse2(x[50], x[53]);
+ btf_16_adds_subs_sse2(x[51], x[52]);
+ btf_16_subs_adds_sse2(x[63], x[56]);
+ btf_16_subs_adds_sse2(x[62], x[57]);
+ btf_16_subs_adds_sse2(x[61], x[58]);
+ btf_16_subs_adds_sse2(x[60], x[59]);
+}
+
+static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_adds_subs_sse2(x[16], x[23]);
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+}
+
+static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[47]);
+ btf_16_adds_subs_sse2(x[33], x[46]);
+ btf_16_adds_subs_sse2(x[34], x[45]);
+ btf_16_adds_subs_sse2(x[35], x[44]);
+ btf_16_adds_subs_sse2(x[36], x[43]);
+ btf_16_adds_subs_sse2(x[37], x[42]);
+ btf_16_adds_subs_sse2(x[38], x[41]);
+ btf_16_adds_subs_sse2(x[39], x[40]);
+ btf_16_subs_adds_sse2(x[63], x[48]);
+ btf_16_subs_adds_sse2(x[62], x[49]);
+ btf_16_subs_adds_sse2(x[61], x[50]);
+ btf_16_subs_adds_sse2(x[60], x[51]);
+ btf_16_subs_adds_sse2(x[59], x[52]);
+ btf_16_subs_adds_sse2(x[58], x[53]);
+ btf_16_subs_adds_sse2(x[57], x[54]);
+ btf_16_subs_adds_sse2(x[56], x[55]);
+}
+
+static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[31]);
+ btf_16_adds_subs_sse2(x[1], x[30]);
+ btf_16_adds_subs_sse2(x[2], x[29]);
+ btf_16_adds_subs_sse2(x[3], x[28]);
+ btf_16_adds_subs_sse2(x[4], x[27]);
+ btf_16_adds_subs_sse2(x[5], x[26]);
+ btf_16_adds_subs_sse2(x[6], x[25]);
+ btf_16_adds_subs_sse2(x[7], x[24]);
+ btf_16_adds_subs_sse2(x[8], x[23]);
+ btf_16_adds_subs_sse2(x[9], x[22]);
+ btf_16_adds_subs_sse2(x[10], x[21]);
+ btf_16_adds_subs_sse2(x[11], x[20]);
+ btf_16_adds_subs_sse2(x[12], x[19]);
+ btf_16_adds_subs_sse2(x[13], x[18]);
+ btf_16_adds_subs_sse2(x[14], x[17]);
+ btf_16_adds_subs_sse2(x[15], x[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+}
+
+static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
+ btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
+ btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
+ btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
+ btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
+ btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
+ btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
+ btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
+ btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
+ btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
+ btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
+ btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
+ btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
+ btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
+ btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
+ btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
+ btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
+ btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
+ btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
+ btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
+ btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
+ btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
+ btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
+ btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
+ btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
+ btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
+ btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
+ btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
+ btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
+ btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
+ btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
+ btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
+}
+
+static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 7
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ output[0] = x[0];
+ output[63] = x[0];
+ output[1] = x[1];
+ output[62] = x[1];
+ output[2] = x[1];
+ output[61] = x[1];
+ output[3] = x[0];
+ output[60] = x[0];
+ output[4] = x[0];
+ output[59] = x[0];
+ output[5] = x[1];
+ output[58] = x[1];
+ output[6] = x[1];
+ output[57] = x[1];
+ output[7] = x[0];
+ output[56] = x[0];
+ output[8] = x[0];
+ output[55] = x[0];
+ output[9] = x[1];
+ output[54] = x[1];
+ output[10] = x[1];
+ output[53] = x[1];
+ output[11] = x[0];
+ output[52] = x[0];
+ output[12] = x[0];
+ output[51] = x[0];
+ output[13] = x[1];
+ output[50] = x[1];
+ output[14] = x[1];
+ output[49] = x[1];
+ output[15] = x[0];
+ output[48] = x[0];
+ output[16] = x[0];
+ output[47] = x[0];
+ output[17] = x[1];
+ output[46] = x[1];
+ output[18] = x[1];
+ output[45] = x[1];
+ output[19] = x[0];
+ output[44] = x[0];
+ output[20] = x[0];
+ output[43] = x[0];
+ output[21] = x[1];
+ output[42] = x[1];
+ output[22] = x[1];
+ output[41] = x[1];
+ output[23] = x[0];
+ output[40] = x[0];
+ output[24] = x[0];
+ output[39] = x[0];
+ output[25] = x[1];
+ output[38] = x[1];
+ output[26] = x[1];
+ output[37] = x[1];
+ output[27] = x[0];
+ output[36] = x[0];
+ output[28] = x[0];
+ output[35] = x[0];
+ output[29] = x[1];
+ output[34] = x[1];
+ output[30] = x[1];
+ output[33] = x[1];
+ output[31] = x[0];
+ output[32] = x[0];
+}
+
+static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+ // stage 5
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ x[9] = x[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[33]);
+ btf_16_subs_adds_sse2(x[35], x[34]);
+ btf_16_adds_subs_sse2(x[36], x[37]);
+ btf_16_subs_adds_sse2(x[39], x[38]);
+ btf_16_adds_subs_sse2(x[40], x[41]);
+ btf_16_subs_adds_sse2(x[43], x[42]);
+ btf_16_adds_subs_sse2(x[44], x[45]);
+ btf_16_subs_adds_sse2(x[47], x[46]);
+ btf_16_adds_subs_sse2(x[48], x[49]);
+ btf_16_subs_adds_sse2(x[51], x[50]);
+ btf_16_adds_subs_sse2(x[52], x[53]);
+ btf_16_subs_adds_sse2(x[55], x[54]);
+ btf_16_adds_subs_sse2(x[56], x[57]);
+ btf_16_subs_adds_sse2(x[59], x[58]);
+ btf_16_adds_subs_sse2(x[60], x[61]);
+ btf_16_subs_adds_sse2(x[63], x[62]);
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 9~11
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+ const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+ const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+ const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+ const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+ const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+ const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+ const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+ __m128i x0[4];
+ x0[0] = input[0];
+ x0[1] = input[1];
+ x0[2] = input[2];
+ x0[3] = input[3];
+
+ __m128i u[4];
+ u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+ u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
+ u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
+ u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
+
+ __m128i x1[16];
+ x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
+ x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
+ x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
+ x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
+ x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2
+ x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
+ x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4
+ x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
+ x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
+ x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
+ x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x2*sin3
+ x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
+ x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
+ x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
+ x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1
+ x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
+
+ __m128i x2[8];
+ x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
+ x2[1] = _mm_add_epi32(x1[1], x1[5]);
+ x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
+ x2[3] = _mm_add_epi32(x1[3], x1[7]);
+ x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3
+ x2[5] = _mm_add_epi32(x1[9], x1[11]);
+ x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1
+ x2[7] = _mm_add_epi32(x1[13], x1[15]);
+
+ const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ for (int i = 0; i < 4; ++i) {
+ __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
+ __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
+ out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+ out1 = _mm_srai_epi32(out1, INV_COS_BIT);
+ output[i] = _mm_packs_epi32(out0, out1);
+ }
+}
+
+// TODO(binpengsmail@gmail.com):
+// To explore the reuse of VP9 versions of corresponding SSE2 functions and
+// evaluate whether there is a possibility for further speedup.
+void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+ const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+ const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+ const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+ const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+ const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+ const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+ const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+ __m128i x0[4];
+ x0[0] = input[0];
+ x0[1] = input[1];
+ x0[2] = input[2];
+ x0[3] = input[3];
+
+ __m128i u[2];
+ u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+ u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
+
+ __m128i x1[8];
+ x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
+ x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
+ x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2
+ x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4
+ x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
+ x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3
+ x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
+ x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1
+
+ __m128i x2[4];
+ x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+ x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+ x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3
+ x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
+
+ const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ for (int i = 0; i < 4; ++i) {
+ __m128i out0 = _mm_add_epi32(x2[i], rounding);
+ out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+ output[i] = _mm_packs_epi32(out0, out0);
+ }
+}
+
+static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[1] = input[0];
+
+ // stage 2
+ btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
+
+ // stage 3
+ x[4] = x[0];
+ x[5] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+
+ // stage 5
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static INLINE void iadst16_stage3_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[8]);
+ btf_16_adds_subs_sse2(x[1], x[9]);
+ btf_16_adds_subs_sse2(x[2], x[10]);
+ btf_16_adds_subs_sse2(x[3], x[11]);
+ btf_16_adds_subs_sse2(x[4], x[12]);
+ btf_16_adds_subs_sse2(x[5], x[13]);
+ btf_16_adds_subs_sse2(x[6], x[14]);
+ btf_16_adds_subs_sse2(x[7], x[15]);
+}
+
+static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage5_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+ btf_16_adds_subs_sse2(x[8], x[12]);
+ btf_16_adds_subs_sse2(x[9], x[13]);
+ btf_16_adds_subs_sse2(x[10], x[14]);
+ btf_16_adds_subs_sse2(x[11], x[15]);
+}
+
+static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage7_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+ btf_16_adds_subs_sse2(x[8], x[10]);
+ btf_16_adds_subs_sse2(x[9], x[11]);
+ btf_16_adds_subs_sse2(x[12], x[14]);
+ btf_16_adds_subs_sse2(x[13], x[15]);
+}
+
+static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
+ const __m128i __zero = _mm_setzero_si128();
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[8]);
+ output[2] = x[12];
+ output[3] = _mm_subs_epi16(__zero, x[4]);
+ output[4] = x[6];
+ output[5] = _mm_subs_epi16(__zero, x[14]);
+ output[6] = x[10];
+ output[7] = _mm_subs_epi16(__zero, x[2]);
+ output[8] = x[3];
+ output[9] = _mm_subs_epi16(__zero, x[11]);
+ output[10] = x[15];
+ output[11] = _mm_subs_epi16(__zero, x[7]);
+ output[12] = x[5];
+ output[13] = _mm_subs_epi16(__zero, x[13]);
+ output[14] = x[9];
+ output[15] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[1] = input[0];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+
+ // stage 3
+ x[8] = x[0];
+ x[9] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+
+ // stage 5
+ x[4] = x[0];
+ x[5] = x[1];
+ x[12] = x[8];
+ x[13] = x[9];
+
+ // stage 6
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+
+ // stage 7
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+ x[10] = x[8];
+ x[11] = x[9];
+ x[14] = x[12];
+ x[15] = x[13];
+
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[16];
+ x[1] = input[0];
+ x[3] = input[2];
+ x[5] = input[4];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[1];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+ btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
+ btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
+ btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
+ btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
+ btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
+ btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
+ btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3~9
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+
+ // stage 5
+ iadst16_stage5_ssse3(x);
+
+ // stage 6
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+
+ // stage 7
+ iadst16_stage7_ssse3(x);
+
+ // stage 8
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+
+ // stage 9
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
+ const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+ for (int i = 0; i < 4; ++i) {
+ __m128i x = _mm_mulhrs_epi16(input[i], scale);
+ output[i] = _mm_adds_epi16(x, input[i]);
+ }
+}
+
+static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 8; ++i) {
+ output[i] = _mm_adds_epi16(input[i], input[i]);
+ }
+}
+
+static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
+ const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+ for (int i = 0; i < 16; ++i) {
+ __m128i x = _mm_mulhrs_epi16(input[i], scale);
+ __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
+ output[i] = _mm_adds_epi16(x, srcx2);
+ }
+}
+
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+ __m128i res) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
+ return _mm_packus_epi16(x0, x0);
+}
+
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i, j += step) {
+ const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+ __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+ u = _mm_packus_epi16(u, zero);
+ *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+ }
+}
+
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+ const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
+ _mm_storel_epi64((__m128i *)(output + i * stride), u);
+ }
+}
+
+// 1D functions process process 8 pixels at one time.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
+ { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
+ { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
+ { idct32_new_sse2, NULL, NULL },
+ { idct64_low32_new_ssse3, NULL, NULL },
+ };
+
+// functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
+ { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
+ { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
+ },
+ { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
+ { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
+ { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
+ {
+ { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
+ NULL },
+ { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
+ idct32_new_sse2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
+ idct64_low32_new_ssse3 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// 1D functions process process 4 pixels at one time.
+// used in 4x4, 4x8, 4x16, 8x4, 16x4
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
+ { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
+ { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
+ { NULL, NULL, NULL },
+ { NULL, NULL, NULL },
+ };
+
+static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
+ int stride, int shift, int height,
+ int txw_idx, int rect_type) {
+ const int32_t *input_row = input;
+ const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
+ const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
+ if (rect_type != 1 && rect_type != -1) {
+ for (int i = 0; i < height; ++i) {
+ const __m128i src = load_32bit_to_16bit(input_row);
+ input_row += stride;
+ __m128i lo = _mm_unpacklo_epi16(src, one);
+ __m128i hi = _mm_unpackhi_epi16(src, one);
+ lo = _mm_madd_epi16(lo, scale_rounding);
+ hi = _mm_madd_epi16(hi, scale_rounding);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm_packs_epi32(lo, hi);
+ }
+ } else {
+ const __m128i rect_scale =
+ _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+ for (int i = 0; i < height; ++i) {
+ __m128i src = load_32bit_to_16bit(input_row);
+ src = _mm_mulhrs_epi16(src, rect_scale);
+ input_row += stride;
+ __m128i lo = _mm_unpacklo_epi16(src, one);
+ __m128i hi = _mm_unpackhi_epi16(src, one);
+ lo = _mm_madd_epi16(lo, scale_rounding);
+ hi = _mm_madd_epi16(hi, scale_rounding);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm_packs_epi32(lo, hi);
+ }
+ }
+}
+
+static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
+ __m128i *buf, int shift, int height,
+ int txh_idx) {
+ const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
+ const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
+ const __m128i zero = _mm_setzero_si128();
+ for (int h = 0; h < height; ++h) {
+ __m128i lo = _mm_unpacklo_epi16(buf[h], one);
+ __m128i hi = _mm_unpackhi_epi16(buf[h], one);
+ lo = _mm_madd_epi16(lo, scale_coeff);
+ hi = _mm_madd_epi16(hi, scale_coeff);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits);
+ lo = _mm_add_epi32(lo, shift_rounding);
+ hi = _mm_add_epi32(hi, shift_rounding);
+ lo = _mm_srai_epi32(lo, -shift);
+ hi = _mm_srai_epi32(hi, -shift);
+ __m128i x = _mm_packs_epi32(lo, hi);
+
+ const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
+ x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
+ const __m128i u = _mm_packus_epi16(x, x);
+ _mm_storel_epi64((__m128i *)(output), u);
+ output += stride;
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_SIZE tx_size) {
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ __m128i buf[32];
+
+ for (int i = 0; i < (input_stride >> 3); ++i) {
+ iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
+ txw_idx, rect_type);
+ iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
+ txh_idx);
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[4];
+ const TX_SIZE tx_size = TX_4X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x4(buf, buf);
+ row_txfm(buf, buf, cos_bit_row);
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x4(temp, buf);
+ } else {
+ transpose_16bit_4x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
+ __m128i res0, __m128i res1) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x0 = _mm_unpacklo_epi8(pred, zero);
+ __m128i x1 = _mm_unpackhi_epi8(pred, zero);
+ x0 = _mm_adds_epi16(res0, x0);
+ x1 = _mm_adds_epi16(res1, x1);
+ return _mm_packus_epi16(x0, x1);
+}
+
+static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
+ _mm_storeu_si128((__m128i *)(output + i * stride), u);
+ }
+}
+
+static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
+ int size) {
+ const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
+ for (int i = 0; i < size; ++i) {
+ output[i] = _mm_mulhrs_epi16(input[i], scale);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64 * 8];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ __m128i *buf0_cur = buf0 + j * 8;
+ load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+ transpose_16bit_8x8(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1 + i * 8;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp,
+ _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
+ round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = (eobx + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ assert(fun_idx < 5);
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+
+ assert(col_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ __m128i buf0[64];
+ iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
+ eoby + 1, txw_idx, rect_type);
+ col_txfm(buf0, buf0, cos_bit_col);
+ __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
+ int k = ud_flip ? (txfm_size_row - 1) : 0;
+ const int step = ud_flip ? -1 : 1;
+ uint8_t *out = output + 8 * i;
+ for (int j = 0; j < txfm_size_row; ++j, k += step) {
+ const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
+ __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
+ const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
+ _mm_storel_epi64((__m128i *)(out), u);
+ out += stride;
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div8; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
+ __m128i *buf0_cur = buf0 + j * 8;
+ load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+ transpose_16bit_8x8(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
+ }
+ }
+
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
+ buf1 + j * 8, shift[1], 8, txh_idx);
+ }
+ }
+}
+
+// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case DCT_DCT:
+ lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case IDTX:
+ lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_4X8;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x8(buf, buf);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf);
+ } else {
+ transpose_16bit_8x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_8X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_8x4(buf, buf);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x8(temp, buf);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_4X16;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ const int row_one_loop = 8;
+ for (int i = 0; i < 2; ++i) {
+ const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
+ row_one_loop);
+ transpose_16bit_4x8(buf_cur, buf_cur);
+ row_txfm(buf_cur, buf_cur, cos_bit_row);
+ round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf_cur, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf_cur);
+ } else {
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_16X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int row_one_loop = 8;
+ for (int i = 0; i < buf_size_w_div8; ++i) {
+ const int32_t *input_cur = input + i * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
+ txfm_size_row);
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+ if (lr_flip) {
+ __m128i temp[16];
+ flip_buf_sse2(buf, temp, 16);
+ transpose_16bit_4x8(temp, buf);
+ transpose_16bit_4x8(temp + 8, buf + 8);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
+ round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
+ }
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
+ lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
+}
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_size) {
+ case TX_4X4:
+ lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X8:
+ lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_8X4:
+ lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X16:
+ lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_16X4:
+ lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (!txfm_param->lossless) {
+ av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
new file mode 100644
index 000000000..dc9be25d2
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h> // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define btf_16_ssse3(w0, w1, in, out0, out1) \
+ do { \
+ const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
+ const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
+ const __m128i _in = in; \
+ out0 = _mm_mulhrs_epi16(_in, _w0); \
+ out1 = _mm_mulhrs_epi16(_in, _w1); \
+ } while (0)
+
+#define btf_16_adds_subs_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+#define btf_16_subs_adds_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ } while (0)
+
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ out0 = _mm_adds_epi16(_in0, _in1); \
+ out1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+ if (bit < 0) {
+ const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_mulhrs_epi16(in[i], scale);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ if (eob == 1) {
+ *eobx = 0;
+ *eoby = 0;
+ return;
+ }
+
+ const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+ const int eob_row = (eob - 1) >> tx_w_log2;
+ const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+ *eobx = eobxy & 0xFF;
+ *eoby = eobxy >> 8;
+}
+
+static int eob_fill[32] = {
+ 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+ *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+ const int temp_eoby = eob / (eobx_max + 1);
+ assert(temp_eoby < 32);
+ *eoby = eob_fill[temp_eoby];
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+ *eobx = eob / (eoby_max + 1);
+ *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
deleted file mode 100644
index fd0a6ed2c..000000000
--- a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
+++ /dev/null
@@ -1,144 +0,0 @@
-#ifndef AV1_TXMF1D_SSE2_H_
-#define AV1_TXMF1D_SSE2_H_
-
-#include <smmintrin.h>
-#include "av1/common/av1_txfm.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-
-void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-
-void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-
-void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range);
-
-static INLINE void transpose_32_4x4(int stride, const __m128i *input,
- __m128i *output) {
- __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
- __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
- __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
- __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
-
- output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
- output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
- output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
- output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
-}
-
-// the entire input block can be represent by a grid of 4x4 blocks
-// each 4x4 blocks can be represent by 4 vertical __m128i
-// we first transpose each 4x4 block internally
-// then transpose the grid
-static INLINE void transpose_32(int txfm_size, const __m128i *input,
- __m128i *output) {
- const int num_per_128 = 4;
- const int row_size = txfm_size;
- const int col_size = txfm_size / num_per_128;
- int r, c;
-
- // transpose each 4x4 block internally
- for (r = 0; r < row_size; r += 4) {
- for (c = 0; c < col_size; c++) {
- transpose_32_4x4(col_size, &input[r * col_size + c],
- &output[c * 4 * col_size + r / 4]);
- }
- }
-}
-
-static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
- __m128i tmp, round;
- round = _mm_set1_epi32(1 << (bit - 1));
- tmp = _mm_add_epi32(vec, round);
- return _mm_srai_epi32(tmp, bit);
-}
-
-static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
- const int size, const int bit) {
- if (bit > 0) {
- int i;
- for (i = 0; i < size; i++) {
- output[i] = round_shift_32_sse4_1(input[i], bit);
- }
- } else {
- int i;
- for (i = 0; i < size; i++) {
- output[i] = _mm_slli_epi32(input[i], -bit);
- }
- }
-}
-
-// out0 = in0*w0 + in1*w1
-// out1 = -in1*w0 + in0*w1
-#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
- do { \
- __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
- ww0 = _mm_set1_epi32(w0); \
- ww1 = _mm_set1_epi32(w1); \
- in0_w0 = _mm_mullo_epi32(in0, ww0); \
- in1_w1 = _mm_mullo_epi32(in1, ww1); \
- out0 = _mm_add_epi32(in0_w0, in1_w1); \
- out0 = round_shift_32_sse4_1(out0, bit); \
- in0_w1 = _mm_mullo_epi32(in0, ww1); \
- in1_w0 = _mm_mullo_epi32(in1, ww0); \
- out1 = _mm_sub_epi32(in0_w1, in1_w0); \
- out1 = round_shift_32_sse4_1(out1, bit); \
- } while (0)
-
-// out0 = in0*w0 + in1*w1
-// out1 = in1*w0 - in0*w1
-#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
- do { \
- __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
- ww0 = _mm_set1_epi32(w0); \
- ww1 = _mm_set1_epi32(w1); \
- in0_w0 = _mm_mullo_epi32(in0, ww0); \
- in1_w1 = _mm_mullo_epi32(in1, ww1); \
- out0 = _mm_add_epi32(in0_w0, in1_w1); \
- out0 = round_shift_32_sse4_1(out0, bit); \
- in0_w1 = _mm_mullo_epi32(in0, ww1); \
- in1_w0 = _mm_mullo_epi32(in1, ww0); \
- out1 = _mm_sub_epi32(in1_w0, in0_w1); \
- out1 = round_shift_32_sse4_1(out1, bit); \
- } while (0)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // AV1_TXMF1D_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 000000000..721cfe059
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void btf_16_w4_sse2(
+ const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+ const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+ __m128i *const out0, __m128i *const out1) {
+ const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i u0 = _mm_madd_epi16(t0, *w0);
+ const __m128i v0 = _mm_madd_epi16(t0, *w1);
+ const __m128i a0 = _mm_add_epi32(u0, __rounding);
+ const __m128i b0 = _mm_add_epi32(v0, __rounding);
+ const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
+ const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
+
+ *out0 = _mm_packs_epi32(c0, c0);
+ *out1 = _mm_packs_epi32(d0, c0);
+}
+
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+ { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c0); \
+ out1 = _mm_packs_epi32(d0, d0); \
+ }
+
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+ { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i t1 = _mm_unpackhi_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i u1 = _mm_madd_epi16(t1, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ __m128i v1 = _mm_madd_epi16(t1, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i a1 = _mm_add_epi32(u1, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ __m128i b1 = _mm_add_epi32(v1, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i c1 = _mm_srai_epi32(a1, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ __m128i d1 = _mm_srai_epi32(b1, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c1); \
+ out1 = _mm_packs_epi32(d0, d1); \
+ }
+
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+ return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, a_low);
+}
+
+// Store 4 16 bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+ const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+ _mm_store_si128((__m128i *)b, a_1);
+}
+
+// Store 8 16 bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a);
+ const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+ const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
+ _mm_store_si128((__m128i *)b, a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+}
+
+static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
+ const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
+ const __m128i b = _mm_madd_epi16(a, scale_rounding);
+ return _mm_srai_epi32(b, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
+ int32_t *const b) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ _mm_store_si128((__m128i *)b, b_lo);
+}
+
+static INLINE void store_rect_16bit_to_32bit(const __m128i a,
+ int32_t *const b) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ _mm_store_si128((__m128i *)b, b_lo);
+ _mm_store_si128((__m128i *)(b + 4), b_hi);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+ const int stride,
+ __m128i *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+ const int stride,
+ __m128i *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_16bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+ int stride, __m128i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w4(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
+ int stride, __m128i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_16bit_to_32bit_w4(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_16bit_to_32bit(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
+ uint16_t *out,
+ const int stride) {
+ for (int i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)(out + i * stride), in[i]);
+ }
+}
+
+static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
+ if (bit < 0) {
+ bit = -bit;
+ __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_adds_epi16(in[i], rounding);
+ in[i] = _mm_srai_epi16(in[i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+typedef struct {
+ transform_1d_sse2 col, row; // vertical and horizontal
+} transform_2d_sse2;
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
new file mode 100644
index 000000000..cccc62f03
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -0,0 +1,10 @@
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
+ __m128i *const vec = (__m128i *)arr;
+ const int vec_size = size >> 2;
+ av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit);
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
new file mode 100644
index 000000000..faf7251fa
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -0,0 +1,60 @@
+#ifndef AV1_TXFM_SSE4_H_
+#define AV1_TXFM_SSE4_H_
+
+#include <smmintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
+ __m128i tmp, round;
+ round = _mm_set1_epi32(1 << (bit - 1));
+ tmp = _mm_add_epi32(vec, round);
+ return _mm_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
+ __m128i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_sse4_1(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
+ __m128i *output,
+ const int size,
+ const int bit) {
+ const __m128i sqrt2 = _mm_set1_epi32(NewSqrt2);
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
+ const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m128i r0 = _mm_slli_epi32(input[i], -bit);
+ const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+ }
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c
new file mode 100644
index 000000000..a8bfdcce6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_avx2.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
+ TX_SIZE tx_size) { \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
+ subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
+ subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
+ subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
+ cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
+ subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
+ subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
+ subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
+ subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
+ subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
+ cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
+ subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
+ subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
+ subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
+ cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ return subfn_##sub[tx_size]; \
+ }
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos
+ const int luma_stride = input_stride << 1;
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
+ __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
+ __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
+
+ _mm256_storeu_si256(row, sum_16x16);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
+ _mm256_storeu_si256(row, top_16x16);
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
+ * performed on block of width 32.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ const __m256i zeros = _mm256_setzero_si256();
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
+
+ __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
+ row_lo = _mm256_slli_epi16(row_lo, 3);
+ __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
+ row_hi = _mm256_slli_epi16(row_hi, 3);
+
+ _mm256_storeu_si256(row, row_lo);
+ _mm256_storeu_si256(row + 1, row_hi);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const int luma_stride = input_stride << 1;
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+ __m256i sum = _mm256_add_epi16(top, bot);
+
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
+ __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
+
+ __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+ hsum = _mm256_add_epi16(hsum, hsum);
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ */
+static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ __m256i hsum = _mm256_hadd_epi16(top, top_1);
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+ hsum = _mm256_slli_epi16(hsum, 2);
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)
+
+static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
+ _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+
+static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
+ __m256i alpha_sign, __m256i dc_q0) {
+ __m256i ac_q3 = _mm256_loadu_si256(input);
+ __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
+ __m256i scaled_luma_q0 =
+ _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm256_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ (void)width;
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+ const __m256i dc_q0 = _mm256_set1_epi16(*dst);
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+
+ do {
+ __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ res = _mm256_packus_epi16(res, next);
+ res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
+ _mm256_storeu_si256((__m256i *)dst, res);
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 32, 8, lbd);
+CFL_PREDICT_X(avx2, 32, 16, lbd);
+CFL_PREDICT_X(avx2, 32, 32, lbd);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
+ predict_lbd_4x4_ssse3, /* 4x4 */
+ predict_lbd_8x8_ssse3, /* 8x8 */
+ predict_lbd_16x16_ssse3, /* 16x16 */
+ predict_lbd_32x32_avx2, /* 32x32 */
+ cfl_predict_lbd_null, /* 64x64 (invalid CFL size) */
+ predict_lbd_4x8_ssse3, /* 4x8 */
+ predict_lbd_8x4_ssse3, /* 8x4 */
+ predict_lbd_8x16_ssse3, /* 8x16 */
+ predict_lbd_16x8_ssse3, /* 16x8 */
+ predict_lbd_16x32_ssse3, /* 16x32 */
+ predict_lbd_32x16_avx2, /* 32x16 */
+ cfl_predict_lbd_null, /* 32x64 (invalid CFL size) */
+ cfl_predict_lbd_null, /* 64x32 (invalid CFL size) */
+ predict_lbd_4x16_ssse3, /* 4x16 */
+ predict_lbd_16x4_ssse3, /* 16x4 */
+ predict_lbd_8x32_ssse3, /* 8x32 */
+ predict_lbd_32x8_avx2, /* 32x8 */
+ cfl_predict_lbd_null, /* 16x64 (invalid CFL size) */
+ cfl_predict_lbd_null, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+
+static __m256i highbd_max_epi16(int bd) {
+ const __m256i neg_one = _mm256_set1_epi16(-1);
+ // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+ return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
+}
+
+static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
+ return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ // Use SSSE3 version for smaller widths
+ assert(width == 16 || width == 32);
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+ const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
+ const __m256i max = highbd_max_epi16(bd);
+
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ _mm256_storeu_si256((__m256i *)dst,
+ highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
+ if (width == 32) {
+ const __m256i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ _mm256_storeu_si256(
+ (__m256i *)(dst + 16),
+ highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 16, 4, hbd)
+CFL_PREDICT_X(avx2, 16, 8, hbd)
+CFL_PREDICT_X(avx2, 16, 16, hbd)
+CFL_PREDICT_X(avx2, 16, 32, hbd)
+CFL_PREDICT_X(avx2, 32, 8, hbd)
+CFL_PREDICT_X(avx2, 32, 16, hbd)
+CFL_PREDICT_X(avx2, 32, 32, hbd)
+
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
+ predict_hbd_4x4_ssse3, /* 4x4 */
+ predict_hbd_8x8_ssse3, /* 8x8 */
+ predict_hbd_16x16_avx2, /* 16x16 */
+ predict_hbd_32x32_avx2, /* 32x32 */
+ cfl_predict_hbd_null, /* 64x64 (invalid CFL size) */
+ predict_hbd_4x8_ssse3, /* 4x8 */
+ predict_hbd_8x4_ssse3, /* 8x4 */
+ predict_hbd_8x16_ssse3, /* 8x16 */
+ predict_hbd_16x8_avx2, /* 16x8 */
+ predict_hbd_16x32_avx2, /* 16x32 */
+ predict_hbd_32x16_avx2, /* 32x16 */
+ cfl_predict_hbd_null, /* 32x64 (invalid CFL size) */
+ cfl_predict_hbd_null, /* 64x32 (invalid CFL size) */
+ predict_hbd_4x16_ssse3, /* 4x16 */
+ predict_hbd_16x4_avx2, /* 16x4 */
+ predict_hbd_8x32_ssse3, /* 8x32 */
+ predict_hbd_32x8_avx2, /* 32x8 */
+ cfl_predict_hbd_null, /* 16x64 (invalid CFL size) */
+ cfl_predict_hbd_null, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+
+// Returns a vector where all the (32-bits) elements are the sum of all the
+// lanes in a.
+static INLINE __m256i fill_sum_epi32(__m256i a) {
+ // Given that a == [A, B, C, D, E, F, G, H]
+ a = _mm256_hadd_epi32(a, a);
+ // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
+ // a == [A', C', A', C', E', G', E', G']
+ a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+ // a == [A', C', E', G', A', C', E', G']
+ a = _mm256_hadd_epi32(a, a);
+ // Given that A'' == A' + C' and E'' == E' + G'
+ // a == [A'', E'', A'', E'', A'', E'', A'', E'']
+ return _mm256_hadd_epi32(a, a);
+ // Given that A''' == A'' + E''
+ // a == [A''', A''', A''', A''', A''', A''', A''', A''']
+}
+
+static INLINE __m256i _mm256_addl_epi16(__m256i a) {
+ return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
+ _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
+}
+
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {
+ // Use SSE2 version for smaller widths
+ assert(width == 16 || width == 32);
+
+ const __m256i *src = (__m256i *)src_ptr;
+ const __m256i *const end = src + height * CFL_BUF_LINE_I256;
+ // To maximize usage of the AVX2 registers, we sum two rows per loop
+ // iteration
+ const int step = 2 * CFL_BUF_LINE_I256;
+
+ __m256i sum = _mm256_setzero_si256();
+ // For width 32, we use a second sum accumulator to reduce accumulator
+ // dependencies in the loop.
+ __m256i sum2;
+ if (width == 32) sum2 = _mm256_setzero_si256();
+
+ do {
+ // Add top row to the bottom row
+ __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
+ _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
+ sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
+ if (width == 32) { /* Don't worry, this if it gets optimized out. */
+ // Add the second part of the top row to the second part of the bottom row
+ __m256i l1 =
+ _mm256_add_epi16(_mm256_loadu_si256(src + 1),
+ _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
+ sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
+ }
+ src += step;
+ } while (src < end);
+ // Combine both sum accumulators
+ if (width == 32) sum = _mm256_add_epi32(sum, sum2);
+
+ __m256i fill = fill_sum_epi32(sum);
+
+ __m256i avg_epi16 = _mm256_srli_epi32(
+ _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
+ avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
+
+ // Store and subtract loop
+ src = (__m256i *)src_ptr;
+ __m256i *dst = (__m256i *)dst_ptr;
+ do {
+ _mm256_storeu_si256(dst,
+ _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
+ if (width == 32) {
+ _mm256_storeu_si256(
+ dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
+ }
+ src += CFL_BUF_LINE_I256;
+ dst += CFL_BUF_LINE_I256;
+ } while (src < end);
+}
+
+// Declare wrappers for AVX2 sizes
+CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
+CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
+CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
+CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
+CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
+
+// Based on the observation that for small blocks AVX2 does not outperform
+// SSE2, we call the SSE2 code for block widths 4 and 8.
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+ subtract_average_4x4_sse2, /* 4x4 */
+ subtract_average_8x8_sse2, /* 8x8 */
+ subtract_average_16x16_avx2, /* 16x16 */
+ subtract_average_32x32_avx2, /* 32x32 */
+ cfl_subtract_average_null, /* 64x64 (invalid CFL size) */
+ subtract_average_4x8_sse2, /* 4x8 */
+ subtract_average_8x4_sse2, /* 8x4 */
+ subtract_average_8x16_sse2, /* 8x16 */
+ subtract_average_16x8_avx2, /* 16x8 */
+ subtract_average_16x32_avx2, /* 16x32 */
+ subtract_average_32x16_avx2, /* 32x16 */
+ cfl_subtract_average_null, /* 32x64 (invalid CFL size) */
+ cfl_subtract_average_null, /* 64x32 (invalid CFL size) */
+ subtract_average_4x16_sse2, /* 4x16 */
+ subtract_average_16x4_avx2, /* 16x4 */
+ subtract_average_8x32_sse2, /* 8x32 */
+ subtract_average_32x8_avx2, /* 32x8 */
+ cfl_subtract_average_null, /* 16x64 (invalid CFL size) */
+ cfl_subtract_average_null, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+ // index the function pointer array out of bounds.
+ return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
new file mode 100644
index 000000000..7479ac3e1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/blockd.h"
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 4, we reuse them in AVX2
+void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 16, we reuse it in AVX2
+void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for with == 8, we reuse it in AVX2
+void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for with == 16, we reuse it in AVX2
+void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSE2 version is optimal for with == 4, we reuse them in AVX2
+void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+
+// SSE2 version is optimal for with == 8, we reuse them in AVX2
+void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c
new file mode 100644
index 000000000..4783fe098
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "av1/common/cfl.h"
+#include "config/av1_rtcd.h"
+
+static INLINE __m128i fill_sum_epi32(__m128i l0) {
+ l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
+ return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
+}
+
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {
+ const __m128i zeros = _mm_setzero_si128();
+ const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
+ const __m128i *src = (__m128i *)src_ptr;
+ const __m128i *const end = src + height * CFL_BUF_LINE_I128;
+ const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
+
+ __m128i sum = zeros;
+ do {
+ __m128i l0;
+ if (width == 4) {
+ l0 = _mm_add_epi16(_mm_loadl_epi64(src),
+ _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
+ __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
+ _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpacklo_epi16(l1, zeros)));
+ } else {
+ if (width == 8) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src),
+ _mm_loadu_si128(src + CFL_BUF_LINE_I128));
+ } else {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
+ }
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ if (width == 32) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ }
+ }
+ src += step;
+ } while (src < end);
+
+ sum = fill_sum_epi32(sum);
+
+ __m128i avg_epi16 =
+ _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
+ avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
+
+ src = (__m128i *)src_ptr;
+ __m128i *dst = (__m128i *)dst_ptr;
+ do {
+ if (width == 4) {
+ _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
+ } else {
+ _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
+ if (width > 8) {
+ _mm_storeu_si128(dst + 1,
+ _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
+ if (width == 32) {
+ _mm_storeu_si128(dst + 2,
+ _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
+ _mm_storeu_si128(dst + 3,
+ _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
+ }
+ }
+ }
+ src += CFL_BUF_LINE_I128;
+ dst += CFL_BUF_LINE_I128;
+ } while (src < end);
+}
+
+CFL_SUB_AVG_FN(sse2)
diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c
new file mode 100644
index 000000000..bbf007295
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_ssse3.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Load 32-bit integer from memory into the first element of dst.
+static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
+ return _mm_cvtsi32_si128(*((int *)mem_addr));
+}
+
+// Store 32-bit integer from the first element of a into memory.
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
+ *((int *)mem_addr) = _mm_cvtsi128_si32(a);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i twos = _mm_set1_epi8(2);
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeh_epi32(pred_buf_m128i, sum);
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storel_epi64(pred_buf_m128i, sum);
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeu_si128(pred_buf_m128i, sum);
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ __m128i bot_1 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+ top_1 = _mm_maddubs_epi16(top_1, twos);
+ bot_1 = _mm_maddubs_epi16(bot_1, twos);
+ __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i fours = _mm_set1_epi8(4);
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeh_epi32(pred_buf_m128i, top);
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storel_epi64(pred_buf_m128i, top);
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeu_si128(pred_buf_m128i, top);
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ top_1 = _mm_maddubs_epi16(top_1, fours);
+ _mm_storeu_si128(pred_buf_m128i + 1, top_1);
+ }
+ }
+ input += input_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3).
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i zeros = _mm_setzero_si128();
+ const int luma_stride = input_stride;
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ __m128i row = _mm_loadh_epi32((__m128i *)input);
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
+ } else if (width == 8) {
+ __m128i row = _mm_loadl_epi64((__m128i *)input);
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
+ } else {
+ __m128i row = _mm_loadu_si128((__m128i *)input);
+ const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
+ const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
+ if (width == 32) {
+ __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
+ const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
+ _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ const __m128i top = _mm_loadl_epi64((__m128i *)input);
+ const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+ __m128i sum = _mm_add_epi16(top, bot);
+ sum = _mm_hadd_epi16(sum, sum);
+ *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
+ } else {
+ const __m128i top = _mm_loadu_si128((__m128i *)input);
+ const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+ __m128i sum = _mm_add_epi16(top, bot);
+ if (width == 8) {
+ sum = _mm_hadd_epi16(sum, sum);
+ _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+ } else {
+ const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i bot_1 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+ sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
+ _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+ if (width == 32) {
+ const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ const __m128i bot_2 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
+ const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ const __m128i bot_3 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
+ const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
+ const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
+ __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
+ _mm_add_epi16(next_sum, next_sum));
+ }
+ }
+ }
+ input += luma_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ const __m128i top = _mm_loadl_epi64((__m128i *)input);
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+ _mm_storeh_epi32(pred_buf_m128i, sum);
+ } else {
+ const __m128i top = _mm_loadu_si128((__m128i *)input);
+ if (width == 8) {
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+ _mm_storel_epi64(pred_buf_m128i, sum);
+ } else {
+ const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
+ _mm_storeu_si128(pred_buf_m128i, sum);
+ if (width == 32) {
+ const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ }
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ input += input_stride;
+ } while (pred_buf_m128i < end);
+}
+
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
+ _mm_storel_epi64((__m128i *)pred_buf_q3, row);
+ } else {
+ const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3);
+ _mm_storeu_si128((__m128i *)pred_buf_q3, row);
+ if (width >= 16) {
+ __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ row_1 = _mm_slli_epi16(row_1, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1);
+ if (width == 32) {
+ __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ row_2 = _mm_slli_epi16(row_2, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2);
+ __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ row_3 = _mm_slli_epi16(row_3, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3);
+ }
+ }
+ }
+ input += input_stride;
+ pred_buf_q3 += CFL_BUF_LINE;
+ } while (pred_buf_q3 < end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
+
+static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ __m128i ac_q3 = _mm_loadu_si128(input);
+ __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);
+ if (width == 4)
+ _mm_storeh_epi32((__m128i *)dst, res);
+ else
+ _mm_storel_epi64((__m128i *)dst, res);
+ } else {
+ __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)dst, res);
+ if (width == 32) {
+ res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)(dst + 16), res);
+ }
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, lbd)
+
+static INLINE __m128i highbd_max_epi16(int bd) {
+ const __m128i neg_one = _mm_set1_epi16(-1);
+ // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+ return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
+}
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
+ return _mm_max_epi16(_mm_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);
+ const __m128i max = highbd_max_epi16(bd);
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ res = highbd_clamp_epi16(res, zeros, max);
+ if (width == 4) {
+ _mm_storel_epi64((__m128i *)dst, res);
+ } else {
+ _mm_storeu_si128((__m128i *)dst, res);
+ }
+ if (width >= 16) {
+ const __m128i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128(((__m128i *)dst) + 1,
+ highbd_clamp_epi16(res_1, zeros, max));
+ }
+ if (width == 32) {
+ const __m128i res_2 =
+ predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 16),
+ highbd_clamp_epi16(res_2, zeros, max));
+ const __m128i res_3 =
+ predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 24),
+ highbd_clamp_epi16(res_3, zeros, max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, hbd)
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
new file mode 100644
index 000000000..fd5e90a2e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/convolve.h"
+
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
+
+ const __m256i round_const_h = _mm256_set1_epi16(
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+ for (j = 0; j < w; j += 8) {
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data,
+ _mm_loadu_si128(
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+ 1);
+
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+
+ res =
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+
+ /* Vertical filter */
+ {
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ __m256i s[8];
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ __m256i res_a = convolve(s, coeffs_v);
+ __m256i res_b = convolve(s + 4, coeffs_v);
+
+ // Combine V round and 2F-H-V round into a single rounding
+ res_a =
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);
+ res_b =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+
+ /* rounding code */
+ // 16 bit conversion
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ // 8 bit conversion and saturation to uint8
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ // Store values into the destination buffer
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ if (w - j > 4) {
+ _mm_storel_epi64(p_0, res_0);
+ _mm_storel_epi64(p_1, res_1);
+ } else if (w == 4) {
+ xx_storel_32(p_0, res_0);
+ xx_storel_32(p_1, res_1);
+ } else {
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
+}
+
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
index e4d352c0e..fc0e65453 100644
--- a/third_party/aom/av1/common/x86/convolve_2d_sse2.c
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -11,197 +11,20 @@
#include <emmintrin.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
#include "av1/common/convolve.h"
-#if CONFIG_COMPOUND_ROUND
-void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
- DECLARE_ALIGNED(16, uint8_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
- const __m128i zero = _mm_setzero_si128();
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
- // Filter even-index pixels
- const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- res = _mm_packus_epi16(res, res);
- _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const uint8_t *data = &im_block[i * im_stride + j];
- const __m128i src_01 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 1 * im_stride)));
- const __m128i src_23 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 3 * im_stride)));
- const __m128i src_45 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 5 * im_stride)));
- const __m128i src_67 = _mm_unpacklo_epi8(
- _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)),
- _mm_loadl_epi64((__m128i *)(data + 7 * im_stride)));
-
- const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero);
- const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero);
- const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero);
- const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero);
- const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero);
- const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero);
- const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(p + 0,
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
- }
- }
- }
-}
-#else
-void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
- InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
const int bd = 8;
DECLARE_ALIGNED(16, int16_t,
@@ -211,10 +34,14 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
/* Horizontal filter */
{
@@ -237,7 +64,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
for (i = 0; i < im_h; ++i) {
@@ -302,10 +129,14 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
// coeffs 6 7 6 7 6 7 6 7
const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ const __m128i sum_round =
+ _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
for (i = 0; i < h; ++i) {
for (j = 0; j < w; j += 8) {
@@ -358,24 +189,285 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ if (w == 2) {
+ *(uint16_t *)p = _mm_cvtsi128_si32(res);
+ } else if (w == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res);
+ } else {
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
+}
+
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ int i, j;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert((w % 4) == 0);
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+
+ const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
+ const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
+
+ const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
+ const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
+
+ const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
+ const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
+
if (do_average) {
- _mm_storeu_si128(p + 0,
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
+ const __m128i data_ref_0_hi =
+ _mm_loadu_si128((__m128i *)(&dst[j + 8]));
+
+ const __m128i comp_avg_res_lo =
+ comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i comp_avg_res_hi =
+ comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 =
+ _mm_packus_epi16(round_result_lo, round_result_hi);
+
+ _mm_store_si128((__m128i *)(&dst0[j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
+ _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+ const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+
+ const __m128i res = _mm_sll_epi16(d16_0, left_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
+
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
+ else
+ *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
} else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
}
}
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
}
}
}
-#endif
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
index a0e58716d..6fdfb0954 100644
--- a/third_party/aom/av1/common/x86/convolve_avx2.c
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -11,332 +11,267 @@
#include <immintrin.h>
-#include "aom_dsp/aom_dsp_common.h"
-#include "./av1_rtcd.h"
-
-#if CONFIG_CONVOLVE_ROUND
-static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
-
-// 16 epi16 pixels
-static INLINE void pixel_clamp_avx2(__m256i *u, int bd) {
- const __m256i one = _mm256_set1_epi16(1);
- const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
- __m256i clamped, mask;
-
- mask = _mm256_cmpgt_epi16(*u, max);
- clamped = _mm256_andnot_si256(mask, *u);
- mask = _mm256_and_si256(mask, max);
- clamped = _mm256_or_si256(mask, clamped);
-
- const __m256i zero = _mm256_setzero_si256();
- mask = _mm256_cmpgt_epi16(clamped, zero);
- *u = _mm256_and_si256(clamped, mask);
-}
-
-// 8 epi16 pixels
-static INLINE void pixel_clamp_sse2(__m128i *u, int bd) {
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- __m128i clamped, mask;
-
- mask = _mm_cmpgt_epi16(*u, max);
- clamped = _mm_andnot_si128(mask, *u);
- mask = _mm_and_si128(mask, max);
- clamped = _mm_or_si128(mask, clamped);
-
- const __m128i zero = _mm_setzero_si128();
- mask = _mm_cmpgt_epi16(clamped, zero);
- *u = _mm_and_si128(clamped, mask);
-}
-
-// Work on multiple of 32 pixels
-static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst,
- const __m256i *rnd, int shift,
- int num) {
- do {
- __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
- __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
- __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
- __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
-
- x0 = _mm256_add_epi32(x0, *rnd);
- x1 = _mm256_add_epi32(x1, *rnd);
- x2 = _mm256_add_epi32(x2, *rnd);
- x3 = _mm256_add_epi32(x3, *rnd);
-
- x0 = _mm256_srai_epi32(x0, shift);
- x1 = _mm256_srai_epi32(x1, shift);
- x2 = _mm256_srai_epi32(x2, shift);
- x3 = _mm256_srai_epi32(x3, shift);
-
- x0 = _mm256_packs_epi32(x0, x1);
- x2 = _mm256_packs_epi32(x2, x3);
-
- pixel_clamp_avx2(&x0, 8);
- pixel_clamp_avx2(&x2, 8);
-
- x0 = _mm256_packus_epi16(x0, x2);
- x1 = _mm256_loadu_si256((const __m256i *)sindex);
- x2 = _mm256_permutevar8x32_epi32(x0, x1);
-
- _mm256_storeu_si256((__m256i *)dst, x2);
- src += 32;
- dst += 32;
- num--;
- } while (num > 0);
-}
-
-static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst,
- const __m256i *rnd, int shift) {
- __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
- __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-
- x0 = _mm256_add_epi32(x0, *rnd);
- x1 = _mm256_add_epi32(x1, *rnd);
-
- x0 = _mm256_srai_epi32(x0, shift);
- x1 = _mm256_srai_epi32(x1, shift);
-
- x0 = _mm256_packs_epi32(x0, x1);
- pixel_clamp_avx2(&x0, 8);
-
- const __m256i x2 = _mm256_packus_epi16(x0, x0);
- x1 = _mm256_loadu_si256((const __m256i *)sindex);
- x0 = _mm256_permutevar8x32_epi32(x2, x1);
-
- _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0));
-}
-
-static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst,
- const __m256i *rnd, int shift) {
- __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
- x0 = _mm256_add_epi32(x0, *rnd);
- x0 = _mm256_srai_epi32(x0, shift);
-
- x0 = _mm256_packs_epi32(x0, x0);
- pixel_clamp_avx2(&x0, 8);
-
- x0 = _mm256_packus_epi16(x0, x0);
- const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex);
- x0 = _mm256_permutevar8x32_epi32(x0, x1);
+#include "config/av1_rtcd.h"
- _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0));
-}
-
-static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst,
- const __m128i *rnd, int shift) {
- __m128i x = _mm_loadu_si128((const __m128i *)src);
- x = _mm_add_epi32(x, *rnd);
- x = _mm_srai_epi32(x, shift);
-
- x = _mm_packs_epi32(x, x);
- pixel_clamp_sse2(&x, 8);
-
- x = _mm_packus_epi16(x, x);
- *(uint32_t *)dst = _mm_cvtsi128_si32(x);
-}
-
-void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
- uint8_t *dst, int dst_stride, int w, int h,
- int bits) {
- const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
- const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
-
- if (w > 64) { // width = 128
- do {
- cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 32) { // width = 64
- do {
- cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 16) { // width = 32
- do {
- cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 8) { // width = 16
- do {
- cal_rounding_16_avx2(src, dst, &rnd_num, bits);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 4) { // width = 8
- do {
- cal_rounding_8_avx2(src, dst, &rnd_num, bits);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 2) { // width = 4
- do {
- cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else { // width = 2
- do {
- dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits));
- dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits));
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ // right shift is F-1 because we are already dividing
+ // filter co-efficients by 2
+ const int right_shift_bits = (FILTER_BITS - 1);
+ const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+ const __m256i right_shift_const =
+ _mm256_set1_epi16((1 << right_shift_bits) >> 1);
+ __m256i coeffs[4], s[8];
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
}
}
-#if CONFIG_HIGHBITDEPTH
-static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src,
- uint16_t *dst,
- const __m256i *rnd, int shift,
- int num, int bd) {
- do {
- __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
- __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
- __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
- __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
-
- x0 = _mm256_add_epi32(x0, *rnd);
- x1 = _mm256_add_epi32(x1, *rnd);
- x2 = _mm256_add_epi32(x2, *rnd);
- x3 = _mm256_add_epi32(x3, *rnd);
-
- x0 = _mm256_srai_epi32(x0, shift);
- x1 = _mm256_srai_epi32(x1, shift);
- x2 = _mm256_srai_epi32(x2, shift);
- x3 = _mm256_srai_epi32(x3, shift);
-
- x0 = _mm256_packs_epi32(x0, x1);
- x2 = _mm256_packs_epi32(x2, x3);
-
- pixel_clamp_avx2(&x0, bd);
- pixel_clamp_avx2(&x2, bd);
-
- x0 = _mm256_permute4x64_epi64(x0, 0xD8);
- x2 = _mm256_permute4x64_epi64(x2, 0xD8);
-
- _mm256_storeu_si256((__m256i *)dst, x0);
- _mm256_storeu_si256((__m256i *)(dst + 16), x2);
- src += 32;
- dst += 32;
- num--;
- } while (num > 0);
-}
-
-static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src,
- uint16_t *dst,
- const __m256i *rnd, int shift,
- int bd) {
- __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
- __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
-
- x0 = _mm256_add_epi32(x0, *rnd);
- x1 = _mm256_add_epi32(x1, *rnd);
-
- x0 = _mm256_srai_epi32(x0, shift);
- x1 = _mm256_srai_epi32(x1, shift);
-
- x0 = _mm256_packs_epi32(x0, x1);
- pixel_clamp_avx2(&x0, bd);
-
- x0 = _mm256_permute4x64_epi64(x0, 0xD8);
- _mm256_storeu_si256((__m256i *)dst, x0);
-}
-
-static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst,
- const __m256i *rnd, int shift,
- int bd) {
- __m256i x = _mm256_loadu_si256((const __m256i *)src);
- x = _mm256_add_epi32(x, *rnd);
- x = _mm256_srai_epi32(x, shift);
-
- x = _mm256_packs_epi32(x, x);
- pixel_clamp_avx2(&x, bd);
-
- x = _mm256_permute4x64_epi64(x, 0xD8);
- _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x));
-}
-
-static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst,
- const __m128i *rnd, int shift,
- int bd) {
- __m128i x = _mm_loadu_si128((const __m128i *)src);
- x = _mm_add_epi32(x, *rnd);
- x = _mm_srai_epi32(x, shift);
-
- x = _mm_packs_epi32(x, x);
- pixel_clamp_sse2(&x, bd);
- _mm_storel_epi64((__m128i *)dst, x);
-}
-
-void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride,
- uint8_t *dst8, int dst_stride, int w,
- int h, int bits, int bd) {
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
- const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
-
- if (w > 64) { // width = 128
- do {
- cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 32) { // width = 64
- do {
- cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 16) { // width = 32
- do {
- cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 8) { // width = 16
- do {
- cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 4) { // width = 8
- do {
- cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (w > 2) { // width = 4
- do {
- cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else { // width = 2
- do {
- dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd);
- dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd);
- src += src_stride;
- dst += dst_stride;
- h--;
- } while (h > 0);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ __m256i filt[4], coeffs[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+ const __m256i round_0_const =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+ const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+ const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+ assert(conv_params->round_0 > 0);
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b =
+ _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
+ // 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
}
}
-#endif // CONFIG_HIGHBITDEPTH
-#endif // CONFIG_CONVOLVE_ROUND
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
new file mode 100644
index 000000000..18fe9ae5a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1
+ coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+ coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+ const __m128i *const coeffs) {
+ const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
+ const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
+ return d;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+ __m128i coeffs[4];
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+
+ if (w <= 4) {
+ __m128i s[8], src6, res, res_round, res16;
+ uint32_t res_int;
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+
+ res = convolve_lo_y(s + 0, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+ if (w == 2)
+ *(uint16_t *)dst = res_int;
+ else
+ *(uint32_t *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+
+ res = convolve_lo_y(s + 1, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+ if (w == 2)
+ *(uint16_t *)dst = res_int;
+ else
+ *(uint32_t *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi;
+ __m128i res_lo_round, res_hi_round, res16, res;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
+
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_0_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+ __m128i coeffs[4];
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+
+ if (w <= 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ uint32_t r = _mm_cvtsi128_si32(res);
+ if (w == 2)
+ *(uint16_t *)dst = r;
+ else
+ *(uint32_t *)dst = r;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ // Filter even-index pixels
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
index 4f77da446..c11edc1d4 100644
--- a/third_party/aom/av1/common/x86/filterintra_sse4.c
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -11,888 +11,65 @@
#include <smmintrin.h>
-#include "./av1_rtcd.h"
-#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
#include "av1/common/enums.h"
#include "av1/common/reconintra.h"
-#if USE_3TAP_INTRA_FILTER
-void filterintra_sse4_3tap_dummy_func(void);
-void filterintra_sse4_3tap_dummy_func(void) {}
-#else
-
-static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
- __m128i *sum) {
- const __m128i a = _mm_loadu_si128((const __m128i *)above);
- const __m128i l = _mm_loadu_si128((const __m128i *)left);
- const __m128i zero = _mm_setzero_si128();
-
- __m128i u0 = _mm_unpacklo_epi8(a, zero);
- __m128i u1 = _mm_unpacklo_epi8(l, zero);
-
- sum[0] = _mm_add_epi16(u0, u1);
-}
-
-static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
- __m128i *params) {
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector, u;
- uint16_t sum_value;
-
- AddPixelsSmall(above, left, &sum_vector);
-
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
- u = _mm_srli_si128(sum_vector, 2);
- sum_vector = _mm_add_epi16(sum_vector, u);
-
- sum_value = _mm_extract_epi16(sum_vector, 0);
- sum_value += 4;
- sum_value >>= 3;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
- __m128i *params) {
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector, u;
- uint16_t sum_value;
-
- AddPixelsSmall(above, left, &sum_vector);
-
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
-
- u = _mm_srli_si128(sum_vector, 2);
- sum_vector = _mm_add_epi16(sum_vector, u);
-
- sum_value = _mm_extract_epi16(sum_vector, 0);
- sum_value += 8;
- sum_value >>= 4;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
- __m128i *sum) {
- const __m128i a = _mm_loadu_si128((const __m128i *)above);
- const __m128i l = _mm_loadu_si128((const __m128i *)left);
- const __m128i zero = _mm_setzero_si128();
-
- __m128i u0 = _mm_unpacklo_epi8(a, zero);
- __m128i u1 = _mm_unpacklo_epi8(l, zero);
-
- sum[0] = _mm_add_epi16(u0, u1);
-
- u0 = _mm_unpackhi_epi8(a, zero);
- u1 = _mm_unpackhi_epi8(l, zero);
-
- sum[0] = _mm_add_epi16(sum[0], u0);
- sum[0] = _mm_add_epi16(sum[0], u1);
-}
-
-static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
- __m128i *params) {
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector, u;
- uint16_t sum_value;
-
- AddPixelsLarge(above, left, &sum_vector);
-
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
-
- u = _mm_srli_si128(sum_vector, 2);
- sum_vector = _mm_add_epi16(sum_vector, u);
-
- sum_value = _mm_extract_epi16(sum_vector, 0);
- sum_value += 16;
- sum_value >>= 5;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
- __m128i *params) {
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector[2], u;
- uint16_t sum_value;
-
- AddPixelsLarge(above, left, &sum_vector[0]);
- AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);
-
- sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
- sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
- sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
-
- u = _mm_srli_si128(sum_vector[0], 2);
- sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
-
- sum_value = _mm_extract_epi16(sum_vector[0], 0);
- sum_value += 32;
- sum_value >>= 6;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-// Note:
-// params[4] : mean value, 4 int32_t repetition
-//
-static INLINE int CalcRefPixelsMeanValue(const uint8_t *above,
- const uint8_t *left, int bs,
- __m128i *params) {
- int meanValue = 0;
- switch (bs) {
- case 4: meanValue = GetMeanValue4x4(above, left, params); break;
- case 8: meanValue = GetMeanValue8x8(above, left, params); break;
- case 16: meanValue = GetMeanValue16x16(above, left, params); break;
- case 32: meanValue = GetMeanValue32x32(above, left, params); break;
- default: assert(0);
- }
- return meanValue;
-}
-
-// Note:
-// params[0-3] : 4-tap filter coefficients (int32_t per coefficient)
-//
-static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) {
- const TX_SIZE tx_size =
- (bs == 32) ? TX_32X32
- : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
- // c0
- params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0],
- av1_filter_intra_taps_4[tx_size][mode][0],
- av1_filter_intra_taps_4[tx_size][mode][0],
- av1_filter_intra_taps_4[tx_size][mode][0]);
- // c1
- params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1],
- av1_filter_intra_taps_4[tx_size][mode][1],
- av1_filter_intra_taps_4[tx_size][mode][1],
- av1_filter_intra_taps_4[tx_size][mode][1]);
- // c2
- params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2],
- av1_filter_intra_taps_4[tx_size][mode][2],
- av1_filter_intra_taps_4[tx_size][mode][2],
- av1_filter_intra_taps_4[tx_size][mode][2]);
- // c3
- params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3],
- av1_filter_intra_taps_4[tx_size][mode][3],
- av1_filter_intra_taps_4[tx_size][mode][3],
- av1_filter_intra_taps_4[tx_size][mode][3]);
-}
-
-static const int maxBlkSize = 32;
-
-static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst,
- ptrdiff_t stride) {
- const int predStride = (maxBlkSize << 1) + 1;
- __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
- __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride));
- __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride));
- __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride));
-
- p0 = _mm_add_epi32(p0, mean[0]);
- p1 = _mm_add_epi32(p1, mean[0]);
- p2 = _mm_add_epi32(p2, mean[0]);
- p3 = _mm_add_epi32(p3, mean[0]);
-
- p0 = _mm_packus_epi32(p0, p1);
- p1 = _mm_packus_epi32(p2, p3);
- p0 = _mm_packus_epi16(p0, p1);
-
- *((int *)dst) = _mm_cvtsi128_si32(p0);
- p0 = _mm_srli_si128(p0, 4);
- *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0);
- p0 = _mm_srli_si128(p0, 4);
- *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0);
- p0 = _mm_srli_si128(p0, 4);
- *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0);
-}
-
-static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst,
- ptrdiff_t stride) {
- const int predStride = (maxBlkSize << 1) + 1;
- __m128i p0, p1, p2, p3;
- int r = 0;
-
- while (r < 8) {
- p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
- p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
- r += 1;
- p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
- p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-
- p0 = _mm_add_epi32(p0, mean[0]);
- p1 = _mm_add_epi32(p1, mean[0]);
- p2 = _mm_add_epi32(p2, mean[0]);
- p3 = _mm_add_epi32(p3, mean[0]);
-
- p0 = _mm_packus_epi32(p0, p1);
- p1 = _mm_packus_epi32(p2, p3);
- p0 = _mm_packus_epi16(p0, p1);
-
- _mm_storel_epi64((__m128i *)dst, p0);
- dst += stride;
- p0 = _mm_srli_si128(p0, 8);
- _mm_storel_epi64((__m128i *)dst, p0);
- dst += stride;
- r += 1;
- }
-}
-
-static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst,
- ptrdiff_t stride) {
- const int predStride = (maxBlkSize << 1) + 1;
- __m128i p0, p1, p2, p3;
- int r = 0;
-
- while (r < 16) {
- p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
- p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
- p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
- p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
-
- p0 = _mm_add_epi32(p0, mean[0]);
- p1 = _mm_add_epi32(p1, mean[0]);
- p2 = _mm_add_epi32(p2, mean[0]);
- p3 = _mm_add_epi32(p3, mean[0]);
-
- p0 = _mm_packus_epi32(p0, p1);
- p1 = _mm_packus_epi32(p2, p3);
- p0 = _mm_packus_epi16(p0, p1);
-
- _mm_storel_epi64((__m128i *)dst, p0);
- p0 = _mm_srli_si128(p0, 8);
- _mm_storel_epi64((__m128i *)(dst + 8), p0);
- dst += stride;
- r += 1;
- }
-}
-
-static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst,
- ptrdiff_t stride) {
- const int predStride = (maxBlkSize << 1) + 1;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
- int r = 0;
-
- while (r < 32) {
- p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
- p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
- p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
- p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
-
- p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16));
- p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20));
- p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24));
- p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28));
-
- p0 = _mm_add_epi32(p0, mean[0]);
- p1 = _mm_add_epi32(p1, mean[0]);
- p2 = _mm_add_epi32(p2, mean[0]);
- p3 = _mm_add_epi32(p3, mean[0]);
-
- p4 = _mm_add_epi32(p4, mean[0]);
- p5 = _mm_add_epi32(p5, mean[0]);
- p6 = _mm_add_epi32(p6, mean[0]);
- p7 = _mm_add_epi32(p7, mean[0]);
-
- p0 = _mm_packus_epi32(p0, p1);
- p1 = _mm_packus_epi32(p2, p3);
- p0 = _mm_packus_epi16(p0, p1);
-
- p4 = _mm_packus_epi32(p4, p5);
- p5 = _mm_packus_epi32(p6, p7);
- p4 = _mm_packus_epi16(p4, p5);
-
- _mm_storel_epi64((__m128i *)dst, p0);
- p0 = _mm_srli_si128(p0, 8);
- _mm_storel_epi64((__m128i *)(dst + 8), p0);
-
- _mm_storel_epi64((__m128i *)(dst + 16), p4);
- p4 = _mm_srli_si128(p4, 8);
- _mm_storel_epi64((__m128i *)(dst + 24), p4);
-
- dst += stride;
- r += 1;
- }
-}
-
-static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst,
- ptrdiff_t stride) {
- switch (bs) {
- case 4: SavePred4x4(pred, mean, dst, stride); break;
- case 8: SavePred8x8(pred, mean, dst, stride); break;
- case 16: SavePred16x16(pred, mean, dst, stride); break;
- case 32: SavePred32x32(pred, mean, dst, stride); break;
- default: assert(0);
- }
-}
-
-typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred,
- const int predStride);
-
-static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred,
- const int predStride) {
- __m128i u0, u1, u2;
- int c0 = _mm_extract_epi32(prm[1], 0);
- int x = *(pred + predStride);
- int sum;
-
- u0 = _mm_mullo_epi32(p[0], prm[2]);
- u1 = _mm_mullo_epi32(p[1], prm[0]);
- u2 = _mm_mullo_epi32(p[2], prm[3]);
-
- u0 = _mm_add_epi32(u0, u1);
- u0 = _mm_add_epi32(u0, u2);
-
- sum = _mm_extract_epi32(u0, 0);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 1) = x;
-
- sum = _mm_extract_epi32(u0, 1);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 2) = x;
-
- sum = _mm_extract_epi32(u0, 2);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 3) = x;
-
- sum = _mm_extract_epi32(u0, 3);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 4) = x;
-}
-
-static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred,
- const int predStride) {
- __m128i u0, u1, u2;
- int c0 = _mm_extract_epi32(prm[1], 0);
- int x = *(pred + predStride);
- int sum;
-
- u0 = _mm_mullo_epi32(p[0], prm[2]);
- u1 = _mm_mullo_epi32(p[1], prm[0]);
- u2 = _mm_mullo_epi32(p[2], prm[3]);
-
- u0 = _mm_add_epi32(u0, u1);
- u0 = _mm_add_epi32(u0, u2);
-
- sum = _mm_extract_epi32(u0, 0);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 1) = x;
-
- sum = _mm_extract_epi32(u0, 1);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 2) = x;
-
- sum = _mm_extract_epi32(u0, 2);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 3) = x;
-}
-
-static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred,
- const int predStride) {
- __m128i u0, u1, u2;
- int c0 = _mm_extract_epi32(prm[1], 0);
- int x = *(pred + predStride);
- int sum;
-
- u0 = _mm_mullo_epi32(p[0], prm[2]);
- u1 = _mm_mullo_epi32(p[1], prm[0]);
- u2 = _mm_mullo_epi32(p[2], prm[3]);
-
- u0 = _mm_add_epi32(u0, u1);
- u0 = _mm_add_epi32(u0, u2);
-
- sum = _mm_extract_epi32(u0, 0);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 1) = x;
-
- sum = _mm_extract_epi32(u0, 1);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 2) = x;
-}
-
-static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred,
- const int predStride) {
- __m128i u0, u1, u2;
- int c0 = _mm_extract_epi32(prm[1], 0);
- int x = *(pred + predStride);
- int sum;
-
- u0 = _mm_mullo_epi32(p[0], prm[2]);
- u1 = _mm_mullo_epi32(p[1], prm[0]);
- u2 = _mm_mullo_epi32(p[2], prm[3]);
-
- u0 = _mm_add_epi32(u0, u1);
- u0 = _mm_add_epi32(u0, u2);
-
- sum = _mm_extract_epi32(u0, 0);
- sum += c0 * x;
- x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
- *(pred + predStride + 1) = x;
-}
-
-static ProducePixelsFunc prodPixelsFuncTab[4] = {
- ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels
-};
-
-static void ProducePixels(int *pred, const __m128i *prm, int remain) {
- __m128i p[3];
- const int predStride = (maxBlkSize << 1) + 1;
- int index;
-
- p[0] = _mm_loadu_si128((const __m128i *)pred);
- p[1] = _mm_loadu_si128((const __m128i *)(pred + 1));
- p[2] = _mm_loadu_si128((const __m128i *)(pred + 2));
-
- if (remain <= 2) {
- return;
- }
- if (remain > 5) {
- index = 3;
- } else {
- index = remain - 3;
- }
- prodPixelsFuncTab[index](p, prm, pred, predStride);
-}
-
-// Note:
-// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
-// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
-static void GeneratePrediction(const uint8_t *above, const uint8_t *left,
- const int bs, const __m128i *prm, int meanValue,
- uint8_t *dst, ptrdiff_t stride) {
- int pred[33][65];
- int r, c, colBound;
- int remainings;
-
- for (r = 0; r < bs; ++r) {
- pred[r + 1][0] = (int)left[r] - meanValue;
- }
-
- above -= 1;
- for (c = 0; c < 2 * bs + 1; ++c) {
- pred[0][c] = (int)above[c] - meanValue;
- }
-
- r = 0;
- c = 0;
- while (r < bs) {
- colBound = (bs << 1) - r;
- for (c = 0; c < colBound; c += 4) {
- remainings = colBound - c + 1;
- ProducePixels(&pred[r][c], prm, remainings);
- }
- r += 1;
- }
-
- SavePrediction(&pred[1][1], &prm[4], bs, dst, stride);
-}
-
-static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs,
- __m128i *prm, uint8_t *dst, ptrdiff_t stride) {
- int meanValue = 0;
- meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]);
- GeneratePrediction(above, left, bs, prm, meanValue, dst, stride);
-}
-
-void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, DC_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, V_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, H_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D45_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D135_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D117_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D153_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D207_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D63_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
- __m128i prm[5];
- GetIntraFilterParams(bs, TM_PRED, &prm[0]);
- FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-// ============== High Bit Depth ==============
-#if CONFIG_HIGHBITDEPTH
-static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
- const uint16_t *left, const int bd,
- __m128i *params) {
- const __m128i a = _mm_loadu_si128((const __m128i *)above);
- const __m128i l = _mm_loadu_si128((const __m128i *)left);
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector, u;
- uint16_t sum_value;
- (void)bd;
-
- sum_vector = _mm_add_epi16(a, l);
-
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
- u = _mm_srli_si128(sum_vector, 2);
- sum_vector = _mm_add_epi16(sum_vector, u);
-
- sum_value = _mm_extract_epi16(sum_vector, 0);
- sum_value += 4;
- sum_value >>= 3;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
- const uint16_t *left, const int bd,
- __m128i *params) {
- const __m128i a = _mm_loadu_si128((const __m128i *)above);
- const __m128i l = _mm_loadu_si128((const __m128i *)left);
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector, u;
- uint16_t sum_value;
- (void)bd;
-
- sum_vector = _mm_add_epi16(a, l);
-
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
-
- u = _mm_srli_si128(sum_vector, 2);
- sum_vector = _mm_add_epi16(sum_vector, u);
-
- sum_value = _mm_extract_epi16(sum_vector, 0);
- sum_value += 8;
- sum_value >>= 4;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-// Note:
-// Process 16 pixels above and left, 10-bit depth
-// Add to the last 8 pixels sum
-static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
- __m128i *sum) {
- __m128i a = _mm_loadu_si128((const __m128i *)above);
- __m128i l = _mm_loadu_si128((const __m128i *)left);
- sum[0] = _mm_add_epi16(a, l);
- a = _mm_loadu_si128((const __m128i *)(above + 8));
- l = _mm_loadu_si128((const __m128i *)(left + 8));
- sum[0] = _mm_add_epi16(sum[0], a);
- sum[0] = _mm_add_epi16(sum[0], l);
-}
-
-// Note:
-// Process 16 pixels above and left, 12-bit depth
-// Add to the last 8 pixels sum
-static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
- __m128i *sum) {
- __m128i a = _mm_loadu_si128((const __m128i *)above);
- __m128i l = _mm_loadu_si128((const __m128i *)left);
- const __m128i zero = _mm_setzero_si128();
- __m128i v0, v1;
-
- v0 = _mm_unpacklo_epi16(a, zero);
- v1 = _mm_unpacklo_epi16(l, zero);
- sum[0] = _mm_add_epi32(v0, v1);
-
- v0 = _mm_unpackhi_epi16(a, zero);
- v1 = _mm_unpackhi_epi16(l, zero);
- sum[0] = _mm_add_epi32(sum[0], v0);
- sum[0] = _mm_add_epi32(sum[0], v1);
-
- a = _mm_loadu_si128((const __m128i *)(above + 8));
- l = _mm_loadu_si128((const __m128i *)(left + 8));
-
- v0 = _mm_unpacklo_epi16(a, zero);
- v1 = _mm_unpacklo_epi16(l, zero);
- sum[0] = _mm_add_epi32(sum[0], v0);
- sum[0] = _mm_add_epi32(sum[0], v1);
-
- v0 = _mm_unpackhi_epi16(a, zero);
- v1 = _mm_unpackhi_epi16(l, zero);
- sum[0] = _mm_add_epi32(sum[0], v0);
- sum[0] = _mm_add_epi32(sum[0], v1);
-}
-
-static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
- const uint16_t *left, const int bd,
- __m128i *params) {
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector, u;
- uint32_t sum_value = 0;
-
- if (10 == bd) {
- AddPixels10bit(above, left, &sum_vector);
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
- sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
-
- u = _mm_srli_si128(sum_vector, 2);
- sum_vector = _mm_add_epi16(sum_vector, u);
- sum_value = _mm_extract_epi16(sum_vector, 0);
- } else if (12 == bd) {
- AddPixels12bit(above, left, &sum_vector);
-
- sum_vector = _mm_hadd_epi32(sum_vector, zero);
- u = _mm_srli_si128(sum_vector, 4);
- sum_vector = _mm_add_epi32(u, sum_vector);
- sum_value = _mm_extract_epi32(sum_vector, 0);
- }
-
- sum_value += 16;
- sum_value >>= 5;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
- const uint16_t *left, const int bd,
- __m128i *params) {
- const __m128i zero = _mm_setzero_si128();
- __m128i sum_vector[2], u;
- uint32_t sum_value = 0;
-
- if (10 == bd) {
- AddPixels10bit(above, left, &sum_vector[0]);
- AddPixels10bit(above + 16, left + 16, &sum_vector[1]);
-
- sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
- sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
- sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
-
- u = _mm_srli_si128(sum_vector[0], 2);
- sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
- sum_value = _mm_extract_epi16(sum_vector[0], 0);
- } else if (12 == bd) {
- AddPixels12bit(above, left, &sum_vector[0]);
- AddPixels12bit(above + 16, left + 16, &sum_vector[1]);
-
- sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
- sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);
- u = _mm_srli_si128(sum_vector[0], 4);
- sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
- sum_value = _mm_extract_epi32(sum_vector[0], 0);
- }
-
- sum_value += 32;
- sum_value >>= 6;
- *params = _mm_set1_epi32(sum_value);
- return sum_value;
-}
-
-// Note:
-// params[4] : mean value, 4 int32_t repetition
-//
-static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
- const uint16_t *left, int bs,
- const int bd, __m128i *params) {
- int meanValue = 0;
- switch (bs) {
- case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break;
- case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break;
- case 16:
- meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
- break;
- case 32:
- meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
- break;
- default: assert(0);
- }
- return meanValue;
-}
-
-// Note:
-// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
-// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
-static void HighbdGeneratePrediction(const uint16_t *above,
- const uint16_t *left, const int bs,
- const int bd, const __m128i *prm,
- int meanValue, uint16_t *dst,
- ptrdiff_t stride) {
- int pred[33][65];
- int r, c, colBound;
- int remainings;
- int ipred;
-
- for (r = 0; r < bs; ++r) {
- pred[r + 1][0] = (int)left[r] - meanValue;
- }
-
- above -= 1;
- for (c = 0; c < 2 * bs + 1; ++c) {
- pred[0][c] = (int)above[c] - meanValue;
- }
-
- r = 0;
- c = 0;
- while (r < bs) {
- colBound = (bs << 1) - r;
- for (c = 0; c < colBound; c += 4) {
- remainings = colBound - c + 1;
- ProducePixels(&pred[r][c], prm, remainings);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {
+ int r, c;
+ uint8_t buffer[33][33];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ assert(bw <= 32 && bh <= 32);
+
+ // The initialization is just for silencing Jenkins static analysis warnings
+ for (r = 0; r < bh + 1; ++r)
+ memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+ const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);
+ const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);
+ const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);
+ const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);
+ const __m128i filter_intra_scale_bits =
+ _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));
+
+ for (r = 1; r < bh + 1; r += 2) {
+ for (c = 1; c < bw + 1; c += 4) {
+ DECLARE_ALIGNED(16, uint8_t, p[8]);
+ memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
+ p[5] = buffer[r][c - 1];
+ p[6] = buffer[r + 1][c - 1];
+ p[7] = 0;
+ const __m128i p_b = xx_loadl_64(p);
+ const __m128i in = _mm_unpacklo_epi64(p_b, p_b);
+ const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);
+ const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
+ const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
+ const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
+ const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
+ const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
+ const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567);
+ // Rounding
+ const __m128i round_w =
+ _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
+ const __m128i out_r = _mm_packus_epi16(round_w, round_w);
+ const __m128i out_r1 = _mm_srli_si128(out_r, 4);
+ // Storing
+ xx_storel_32(&buffer[r][c], out_r);
+ xx_storel_32(&buffer[r + 1][c], out_r1);
}
- r += 1;
}
- for (r = 0; r < bs; ++r) {
- for (c = 0; c < bs; ++c) {
- ipred = pred[r + 1][c + 1] + meanValue;
- dst[c] = clip_pixel_highbd(ipred, bd);
- }
+ for (r = 0; r < bh; ++r) {
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
dst += stride;
}
}
-
-static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
- int bs, const int bd, __m128i *prm,
- uint16_t *dst, ptrdiff_t stride) {
- int meanValue = 0;
- meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
- HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride);
-}
-
-void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, DC_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, V_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, H_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D45_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D135_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D117_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D153_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D207_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, D63_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
- __m128i prm[5];
- GetIntraFilterParams(bs, TM_PRED, &prm[0]);
- HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-#endif // CONFIG_HIGHBITDEPTH
-
-#endif // USE_3TAP_INTRA_FILTER
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
new file mode 100644
index 000000000..a34c618d0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_set1_epi16(0);
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+ res_b_round =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+}
+
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+ __m256i s[8];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+ s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
+ s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
+ s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
+
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+ _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
+ _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
+ _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
+ _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
+}
+
+void av1_highbd_convolve_2d_copy_sr_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+ (void)bd;
+
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
new file mode 100644
index 000000000..bdf813fa0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+}
+
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+ __m128i s[16];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
+ s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
+ s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
+ s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
+ s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
+ s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
+ s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
+ s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+ _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
+ _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
+ _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
+ _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
+ _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
+ _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
+ _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
+ _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
+}
+
+void av1_highbd_convolve_2d_copy_sr_sse2(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+ (void)bd;
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ __m128i s = _mm_loadl_epi64((__m128i *)src);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ s = _mm_loadl_epi64((__m128i *)src);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
new file mode 100644
index 000000000..5d2fc465e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const __m128i offset_const_16b = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
+
+ if (!(w % 8)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_16bit =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_32b_lo, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 4) {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+ const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);
+
+ const __m128i res = _mm_sll_epi16(src_10, left_shift);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_1 = _mm_srli_si128(res_clip, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_lo_round, offset_const);
+
+ if (w < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ } else {
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_hi_round, offset_const);
+
+ if (do_average) {
+ const __m128i data_lo =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_hi =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));
+
+ const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
+ const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
index 195f0f570..a9cf6a4d6 100644
--- a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -12,375 +12,209 @@
#include <tmmintrin.h>
#include <assert.h>
-#include "./aom_dsp_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
#include "av1/common/convolve.h"
-#if CONFIG_COMPOUND_ROUND
-void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
+ int im_stride = 8;
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
- const int do_average = conv_params->do_average;
const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_0) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+ __m128i coeffs_x[4], coeffs_y[4], s[16];
+
+ const __m128i round_const_x = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 1) {
+ const __m128i row00 =
_mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- const __m128i data2 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
- // Filter even-index pixels
- const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << conv_params->round_1) >> 1);
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(p + 0,
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
- }
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
}
}
- }
-}
-#else
-void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
- CONV_BUF_TYPE *dst, int dst_stride, int w,
- int h, InterpFilterParams *filter_params_x,
- InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
- int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
- int i, j;
- const int do_average = conv_params->do_average;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 =
+ _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y);
+ res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 =
+ _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y);
+ res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
- // Check that, even with 12-bit input, the intermediate values will fit
- // into an unsigned 15-bit intermediate array.
- assert(conv_params->round_0 >= 5);
-
- /* Horizontal filter */
- {
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
-
- for (i = 0; i < im_h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- const __m128i data2 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
- // Filter even-index pixels
- const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even =
- _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
-
- // Filter odd-index pixels
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
- }
- }
- }
+ *((uint32_t *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
- /* Vertical filter */
- {
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const = _mm_set1_epi32(
- ((1 << conv_params->round_1) >> 1) -
- (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
- const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const int16_t *data = &im_block[i * im_stride + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
- *(__m128i *)(data + 1 * im_stride));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
- *(__m128i *)(data + 3 * im_stride));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
- *(__m128i *)(data + 5 * im_stride));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
- *(__m128i *)(data + 7 * im_stride));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round =
- _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
- const __m128i res_hi_round =
- _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
- // Accumulate values into the destination buffer
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (do_average) {
- _mm_storeu_si128(p + 0,
- _mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
- _mm_storeu_si128(p + 1,
- _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
- } else {
- _mm_storeu_si128(p + 0, res_lo_round);
- _mm_storeu_si128(p + 1, res_hi_round);
+ *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
}
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
}
}
}
}
-#endif
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
index 0e833e6d9..debb05a6d 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -11,8 +11,9 @@
#include <assert.h>
#include <immintrin.h>
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
#include "av1/common/av1_inv_txfm1d_cfg.h"
// Note:
@@ -85,17 +86,6 @@ static void load_buffer_32x32(const int32_t *coeff, __m256i *in) {
}
}
-static void round_shift_32x32(__m256i *in, int shift) {
- __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
- int i = 0;
-
- while (i < 128) {
- in[i] = _mm256_add_epi32(in[i], rnding);
- in[i] = _mm256_srai_epi32(in[i], shift);
- i++;
- }
-}
-
static __m256i highbd_clamp_epi32(__m256i x, int bd) {
const __m256i zero = _mm256_setzero_si256();
const __m256i one = _mm256_set1_epi16(1);
@@ -120,7 +110,7 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
(void)fliplr;
(void)flipud;
- round_shift_32x32(in, shift);
+ __m256i round = _mm256_set1_epi32((1 << shift) >> 1);
while (i < 128) {
u0 = _mm256_loadu_si256((const __m256i *)output);
@@ -136,6 +126,16 @@ static void write_buffer_32x32(__m256i *in, uint16_t *output, int stride,
v2 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x20);
v3 = _mm256_permute2f128_si256(in[i + 2], in[i + 3], 0x31);
+ v0 = _mm256_add_epi32(v0, round);
+ v1 = _mm256_add_epi32(v1, round);
+ v2 = _mm256_add_epi32(v2, round);
+ v3 = _mm256_add_epi32(v3, round);
+
+ v0 = _mm256_sra_epi32(v0, _mm_cvtsi32_si128(shift));
+ v1 = _mm256_sra_epi32(v1, _mm_cvtsi32_si128(shift));
+ v2 = _mm256_sra_epi32(v2, _mm_cvtsi32_si128(shift));
+ v3 = _mm256_sra_epi32(v3, _mm_cvtsi32_si128(shift));
+
v0 = _mm256_add_epi32(v0, x0);
v1 = _mm256_add_epi32(v1, x1);
v2 = _mm256_add_epi32(v2, x2);
@@ -167,7 +167,53 @@ static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
return x;
}
-static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
+static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+ __m256i *out1, const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ __m256i a0 = _mm256_add_epi32(in0, in1);
+ __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
+ __m256i *out0, __m256i *out1) {
+ __m256i a0 = _mm256_add_epi32(in0, in1);
+ __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
+ __m256i *out0, __m256i *out1,
+ const __m256i *clamp_lo, const __m256i *clamp_hi,
+ int shift) {
+ __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
+ __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
+ __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
+ __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
const int32_t *cospi = cospi_arr(bit);
const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
@@ -220,6 +266,9 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
__m256i bf1[32], bf0[32];
int col;
@@ -334,22 +383,15 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
bf1[15] =
half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
- bf1[16] = _mm256_add_epi32(bf0[16], bf0[17]);
- bf1[17] = _mm256_sub_epi32(bf0[16], bf0[17]);
- bf1[18] = _mm256_sub_epi32(bf0[19], bf0[18]);
- bf1[19] = _mm256_add_epi32(bf0[18], bf0[19]);
- bf1[20] = _mm256_add_epi32(bf0[20], bf0[21]);
- bf1[21] = _mm256_sub_epi32(bf0[20], bf0[21]);
- bf1[22] = _mm256_sub_epi32(bf0[23], bf0[22]);
- bf1[23] = _mm256_add_epi32(bf0[22], bf0[23]);
- bf1[24] = _mm256_add_epi32(bf0[24], bf0[25]);
- bf1[25] = _mm256_sub_epi32(bf0[24], bf0[25]);
- bf1[26] = _mm256_sub_epi32(bf0[27], bf0[26]);
- bf1[27] = _mm256_add_epi32(bf0[26], bf0[27]);
- bf1[28] = _mm256_add_epi32(bf0[28], bf0[29]);
- bf1[29] = _mm256_sub_epi32(bf0[28], bf0[29]);
- bf1[30] = _mm256_sub_epi32(bf0[31], bf0[30]);
- bf1[31] = _mm256_add_epi32(bf0[30], bf0[31]);
+
+ addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
// stage 4
bf0[0] = bf1[0];
@@ -363,14 +405,12 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[6] =
half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
- bf0[8] = _mm256_add_epi32(bf1[8], bf1[9]);
- bf0[9] = _mm256_sub_epi32(bf1[8], bf1[9]);
- bf0[10] = _mm256_sub_epi32(bf1[11], bf1[10]);
- bf0[11] = _mm256_add_epi32(bf1[10], bf1[11]);
- bf0[12] = _mm256_add_epi32(bf1[12], bf1[13]);
- bf0[13] = _mm256_sub_epi32(bf1[12], bf1[13]);
- bf0[14] = _mm256_sub_epi32(bf1[15], bf1[14]);
- bf0[15] = _mm256_add_epi32(bf1[14], bf1[15]);
+
+ addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
bf0[16] = bf1[16];
bf0[17] =
half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
@@ -405,10 +445,8 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
bf1[3] =
half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
- bf1[4] = _mm256_add_epi32(bf0[4], bf0[5]);
- bf1[5] = _mm256_sub_epi32(bf0[4], bf0[5]);
- bf1[6] = _mm256_sub_epi32(bf0[7], bf0[6]);
- bf1[7] = _mm256_add_epi32(bf0[6], bf0[7]);
+ addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
bf1[8] = bf0[8];
bf1[9] =
half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
@@ -421,42 +459,28 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf1[14] =
half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
bf1[15] = bf0[15];
- bf1[16] = _mm256_add_epi32(bf0[16], bf0[19]);
- bf1[17] = _mm256_add_epi32(bf0[17], bf0[18]);
- bf1[18] = _mm256_sub_epi32(bf0[17], bf0[18]);
- bf1[19] = _mm256_sub_epi32(bf0[16], bf0[19]);
- bf1[20] = _mm256_sub_epi32(bf0[23], bf0[20]);
- bf1[21] = _mm256_sub_epi32(bf0[22], bf0[21]);
- bf1[22] = _mm256_add_epi32(bf0[21], bf0[22]);
- bf1[23] = _mm256_add_epi32(bf0[20], bf0[23]);
- bf1[24] = _mm256_add_epi32(bf0[24], bf0[27]);
- bf1[25] = _mm256_add_epi32(bf0[25], bf0[26]);
- bf1[26] = _mm256_sub_epi32(bf0[25], bf0[26]);
- bf1[27] = _mm256_sub_epi32(bf0[24], bf0[27]);
- bf1[28] = _mm256_sub_epi32(bf0[31], bf0[28]);
- bf1[29] = _mm256_sub_epi32(bf0[30], bf0[29]);
- bf1[30] = _mm256_add_epi32(bf0[29], bf0[30]);
- bf1[31] = _mm256_add_epi32(bf0[28], bf0[31]);
+ addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
// stage 6
- bf0[0] = _mm256_add_epi32(bf1[0], bf1[3]);
- bf0[1] = _mm256_add_epi32(bf1[1], bf1[2]);
- bf0[2] = _mm256_sub_epi32(bf1[1], bf1[2]);
- bf0[3] = _mm256_sub_epi32(bf1[0], bf1[3]);
+ addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
bf0[4] = bf1[4];
bf0[5] =
half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
bf0[6] =
half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
bf0[7] = bf1[7];
- bf0[8] = _mm256_add_epi32(bf1[8], bf1[11]);
- bf0[9] = _mm256_add_epi32(bf1[9], bf1[10]);
- bf0[10] = _mm256_sub_epi32(bf1[9], bf1[10]);
- bf0[11] = _mm256_sub_epi32(bf1[8], bf1[11]);
- bf0[12] = _mm256_sub_epi32(bf1[15], bf1[12]);
- bf0[13] = _mm256_sub_epi32(bf1[14], bf1[13]);
- bf0[14] = _mm256_add_epi32(bf1[13], bf1[14]);
- bf0[15] = _mm256_add_epi32(bf1[12], bf1[15]);
+ addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
bf0[16] = bf1[16];
bf0[17] = bf1[17];
bf0[18] =
@@ -483,14 +507,10 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[31] = bf1[31];
// stage 7
- bf1[0] = _mm256_add_epi32(bf0[0], bf0[7]);
- bf1[1] = _mm256_add_epi32(bf0[1], bf0[6]);
- bf1[2] = _mm256_add_epi32(bf0[2], bf0[5]);
- bf1[3] = _mm256_add_epi32(bf0[3], bf0[4]);
- bf1[4] = _mm256_sub_epi32(bf0[3], bf0[4]);
- bf1[5] = _mm256_sub_epi32(bf0[2], bf0[5]);
- bf1[6] = _mm256_sub_epi32(bf0[1], bf0[6]);
- bf1[7] = _mm256_sub_epi32(bf0[0], bf0[7]);
+ addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
bf1[8] = bf0[8];
bf1[9] = bf0[9];
bf1[10] =
@@ -503,40 +523,24 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
bf1[14] = bf0[14];
bf1[15] = bf0[15];
- bf1[16] = _mm256_add_epi32(bf0[16], bf0[23]);
- bf1[17] = _mm256_add_epi32(bf0[17], bf0[22]);
- bf1[18] = _mm256_add_epi32(bf0[18], bf0[21]);
- bf1[19] = _mm256_add_epi32(bf0[19], bf0[20]);
- bf1[20] = _mm256_sub_epi32(bf0[19], bf0[20]);
- bf1[21] = _mm256_sub_epi32(bf0[18], bf0[21]);
- bf1[22] = _mm256_sub_epi32(bf0[17], bf0[22]);
- bf1[23] = _mm256_sub_epi32(bf0[16], bf0[23]);
- bf1[24] = _mm256_sub_epi32(bf0[31], bf0[24]);
- bf1[25] = _mm256_sub_epi32(bf0[30], bf0[25]);
- bf1[26] = _mm256_sub_epi32(bf0[29], bf0[26]);
- bf1[27] = _mm256_sub_epi32(bf0[28], bf0[27]);
- bf1[28] = _mm256_add_epi32(bf0[27], bf0[28]);
- bf1[29] = _mm256_add_epi32(bf0[26], bf0[29]);
- bf1[30] = _mm256_add_epi32(bf0[25], bf0[30]);
- bf1[31] = _mm256_add_epi32(bf0[24], bf0[31]);
+ addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
// stage 8
- bf0[0] = _mm256_add_epi32(bf1[0], bf1[15]);
- bf0[1] = _mm256_add_epi32(bf1[1], bf1[14]);
- bf0[2] = _mm256_add_epi32(bf1[2], bf1[13]);
- bf0[3] = _mm256_add_epi32(bf1[3], bf1[12]);
- bf0[4] = _mm256_add_epi32(bf1[4], bf1[11]);
- bf0[5] = _mm256_add_epi32(bf1[5], bf1[10]);
- bf0[6] = _mm256_add_epi32(bf1[6], bf1[9]);
- bf0[7] = _mm256_add_epi32(bf1[7], bf1[8]);
- bf0[8] = _mm256_sub_epi32(bf1[7], bf1[8]);
- bf0[9] = _mm256_sub_epi32(bf1[6], bf1[9]);
- bf0[10] = _mm256_sub_epi32(bf1[5], bf1[10]);
- bf0[11] = _mm256_sub_epi32(bf1[4], bf1[11]);
- bf0[12] = _mm256_sub_epi32(bf1[3], bf1[12]);
- bf0[13] = _mm256_sub_epi32(bf1[2], bf1[13]);
- bf0[14] = _mm256_sub_epi32(bf1[1], bf1[14]);
- bf0[15] = _mm256_sub_epi32(bf1[0], bf1[15]);
+ addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
bf0[16] = bf1[16];
bf0[17] = bf1[17];
bf0[18] = bf1[18];
@@ -563,58 +567,91 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
bf0[31] = bf1[31];
// stage 9
- out[0 * 4 + col] = _mm256_add_epi32(bf0[0], bf0[31]);
- out[1 * 4 + col] = _mm256_add_epi32(bf0[1], bf0[30]);
- out[2 * 4 + col] = _mm256_add_epi32(bf0[2], bf0[29]);
- out[3 * 4 + col] = _mm256_add_epi32(bf0[3], bf0[28]);
- out[4 * 4 + col] = _mm256_add_epi32(bf0[4], bf0[27]);
- out[5 * 4 + col] = _mm256_add_epi32(bf0[5], bf0[26]);
- out[6 * 4 + col] = _mm256_add_epi32(bf0[6], bf0[25]);
- out[7 * 4 + col] = _mm256_add_epi32(bf0[7], bf0[24]);
- out[8 * 4 + col] = _mm256_add_epi32(bf0[8], bf0[23]);
- out[9 * 4 + col] = _mm256_add_epi32(bf0[9], bf0[22]);
- out[10 * 4 + col] = _mm256_add_epi32(bf0[10], bf0[21]);
- out[11 * 4 + col] = _mm256_add_epi32(bf0[11], bf0[20]);
- out[12 * 4 + col] = _mm256_add_epi32(bf0[12], bf0[19]);
- out[13 * 4 + col] = _mm256_add_epi32(bf0[13], bf0[18]);
- out[14 * 4 + col] = _mm256_add_epi32(bf0[14], bf0[17]);
- out[15 * 4 + col] = _mm256_add_epi32(bf0[15], bf0[16]);
- out[16 * 4 + col] = _mm256_sub_epi32(bf0[15], bf0[16]);
- out[17 * 4 + col] = _mm256_sub_epi32(bf0[14], bf0[17]);
- out[18 * 4 + col] = _mm256_sub_epi32(bf0[13], bf0[18]);
- out[19 * 4 + col] = _mm256_sub_epi32(bf0[12], bf0[19]);
- out[20 * 4 + col] = _mm256_sub_epi32(bf0[11], bf0[20]);
- out[21 * 4 + col] = _mm256_sub_epi32(bf0[10], bf0[21]);
- out[22 * 4 + col] = _mm256_sub_epi32(bf0[9], bf0[22]);
- out[23 * 4 + col] = _mm256_sub_epi32(bf0[8], bf0[23]);
- out[24 * 4 + col] = _mm256_sub_epi32(bf0[7], bf0[24]);
- out[25 * 4 + col] = _mm256_sub_epi32(bf0[6], bf0[25]);
- out[26 * 4 + col] = _mm256_sub_epi32(bf0[5], bf0[26]);
- out[27 * 4 + col] = _mm256_sub_epi32(bf0[4], bf0[27]);
- out[28 * 4 + col] = _mm256_sub_epi32(bf0[3], bf0[28]);
- out[29 * 4 + col] = _mm256_sub_epi32(bf0[2], bf0[29]);
- out[30 * 4 + col] = _mm256_sub_epi32(bf0[1], bf0[30]);
- out[31 * 4 + col] = _mm256_sub_epi32(bf0[0], bf0[31]);
+ if (do_cols) {
+ addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0 * 4 + col,
+ out + 31 * 4 + col);
+ addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1 * 4 + col,
+ out + 30 * 4 + col);
+ addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2 * 4 + col,
+ out + 29 * 4 + col);
+ addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3 * 4 + col,
+ out + 28 * 4 + col);
+ addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4 * 4 + col,
+ out + 27 * 4 + col);
+ addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5 * 4 + col,
+ out + 26 * 4 + col);
+ addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6 * 4 + col,
+ out + 25 * 4 + col);
+ addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7 * 4 + col,
+ out + 24 * 4 + col);
+ addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8 * 4 + col,
+ out + 23 * 4 + col);
+ addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9 * 4 + col,
+ out + 22 * 4 + col);
+ addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
+ out + 21 * 4 + col);
+ addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
+ out + 20 * 4 + col);
+ addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
+ out + 19 * 4 + col);
+ addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
+ out + 18 * 4 + col);
+ addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
+ out + 17 * 4 + col);
+ addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
+ out + 16 * 4 + col);
+ } else {
+ addsub_shift_avx2(bf0[0], bf0[31], out + 0 * 4 + col, out + 31 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[1], bf0[30], out + 1 * 4 + col, out + 30 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[2], bf0[29], out + 2 * 4 + col, out + 29 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[3], bf0[28], out + 3 * 4 + col, out + 28 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[4], bf0[27], out + 4 * 4 + col, out + 27 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[5], bf0[26], out + 5 * 4 + col, out + 26 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[6], bf0[25], out + 6 * 4 + col, out + 25 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[7], bf0[24], out + 7 * 4 + col, out + 24 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[8], bf0[23], out + 8 * 4 + col, out + 23 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[9], bf0[22], out + 9 * 4 + col, out + 22 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[10], bf0[21], out + 10 * 4 + col,
+ out + 21 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[11], bf0[20], out + 11 * 4 + col,
+ out + 20 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[12], bf0[19], out + 12 * 4 + col,
+ out + 19 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[13], bf0[18], out + 13 * 4 + col,
+ out + 18 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[14], bf0[17], out + 14 * 4 + col,
+ out + 17 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_avx2(bf0[15], bf0[16], out + 15 * 4 + col,
+ out + 16 * 4 + col, &clamp_lo, &clamp_hi, out_shift);
+ }
}
}
void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m256i in[128], out[128];
- const TXFM_1D_CFG *row_cfg = NULL;
- const TXFM_1D_CFG *col_cfg = NULL;
+ const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
+ const int txw_idx = get_txw_idx(TX_32X32);
+ const int txh_idx = get_txh_idx(TX_32X32);
switch (tx_type) {
case DCT_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_32;
- col_cfg = &inv_txfm_1d_col_cfg_dct_32;
load_buffer_32x32(coeff, in);
transpose_32x32(in, out);
- idct32_avx2(out, in, row_cfg->cos_bit[2]);
- round_shift_32x32(in, -row_cfg->shift[0]);
+ idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
transpose_32x32(in, out);
- idct32_avx2(out, in, col_cfg->cos_bit[2]);
- write_buffer_32x32(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
break;
default: assert(0);
}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
index 8613bed86..801a4133b 100644
--- a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -11,8 +11,9 @@
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */
-#include "./av1_rtcd.h"
-#include "./aom_config.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
@@ -23,13 +24,82 @@ static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}
-static void idct4x4_sse4_1(__m128i *in, int bit) {
+static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
+ __m128i *out1, const __m128i *clamp_lo,
+ const __m128i *clamp_hi) {
+ __m128i a0 = _mm_add_epi32(in0, in1);
+ __m128i a1 = _mm_sub_epi32(in0, in1);
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
+ __m128i *out0, __m128i *out1) {
+ __m128i a0 = _mm_add_epi32(in0, in1);
+ __m128i a1 = _mm_sub_epi32(in0, in1);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
+ __m128i *out0, __m128i *out1,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int shift) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i in0_w_offset = _mm_add_epi32(in0, offset);
+ __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
+ __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
+ __m128i *out0, __m128i *out1,
+ const __m128i *clamp_lo, const __m128i *clamp_hi,
+ int shift) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i a0 = _mm_add_epi32(offset, in0);
+ __m128i a1 = _mm_sub_epi32(offset, in1);
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3, x, y;
@@ -65,84 +135,72 @@ static void idct4x4_sse4_1(__m128i *in, int bit) {
v3 = _mm_add_epi32(v3, rnding);
v3 = _mm_srai_epi32(v3, bit);
- in[0] = _mm_add_epi32(v0, v3);
- in[1] = _mm_add_epi32(v1, v2);
- in[2] = _mm_sub_epi32(v1, v2);
- in[3] = _mm_sub_epi32(v0, v3);
+ addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
}
static void iadst4x4_sse4_1(__m128i *in, int bit) {
- const int32_t *cospi = cospi_arr(bit);
- const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
- const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
- const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
- const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
- const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
- const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
- const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const int32_t *sinpi = sinpi_arr(bit);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const __m128i zero = _mm_setzero_si128();
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
__m128i u0, u1, u2, u3;
- __m128i v0, v1, v2, v3, x, y;
+ __m128i v0, v1, v2, v3;
v0 = _mm_unpacklo_epi32(in[0], in[1]);
v1 = _mm_unpackhi_epi32(in[0], in[1]);
v2 = _mm_unpacklo_epi32(in[2], in[3]);
v3 = _mm_unpackhi_epi32(in[2], in[3]);
- u0 = _mm_unpacklo_epi64(v0, v2);
- u1 = _mm_unpackhi_epi64(v0, v2);
- u2 = _mm_unpacklo_epi64(v1, v3);
- u3 = _mm_unpackhi_epi64(v1, v3);
-
- // stage 0
- // stage 1
- u1 = _mm_sub_epi32(zero, u1);
- u3 = _mm_sub_epi32(zero, u3);
-
- // stage 2
- v0 = u0;
- v1 = u3;
- x = _mm_mullo_epi32(u1, cospi32);
- y = _mm_mullo_epi32(u2, cospi32);
- v2 = _mm_add_epi32(x, y);
- v2 = _mm_add_epi32(v2, rnding);
- v2 = _mm_srai_epi32(v2, bit);
-
- v3 = _mm_sub_epi32(x, y);
- v3 = _mm_add_epi32(v3, rnding);
- v3 = _mm_srai_epi32(v3, bit);
-
- // stage 3
- u0 = _mm_add_epi32(v0, v2);
- u1 = _mm_add_epi32(v1, v3);
- u2 = _mm_sub_epi32(v0, v2);
- u3 = _mm_sub_epi32(v1, v3);
-
- // stage 4
- x = _mm_mullo_epi32(u0, cospi8);
- y = _mm_mullo_epi32(u1, cospi56);
- in[3] = _mm_add_epi32(x, y);
- in[3] = _mm_add_epi32(in[3], rnding);
- in[3] = _mm_srai_epi32(in[3], bit);
-
- x = _mm_mullo_epi32(u0, cospi56);
- y = _mm_mullo_epi32(u1, cospim8);
- in[0] = _mm_add_epi32(x, y);
- in[0] = _mm_add_epi32(in[0], rnding);
- in[0] = _mm_srai_epi32(in[0], bit);
-
- x = _mm_mullo_epi32(u2, cospi40);
- y = _mm_mullo_epi32(u3, cospi24);
- in[1] = _mm_add_epi32(x, y);
- in[1] = _mm_add_epi32(in[1], rnding);
- in[1] = _mm_srai_epi32(in[1], bit);
-
- x = _mm_mullo_epi32(u2, cospi24);
- y = _mm_mullo_epi32(u3, cospim40);
- in[2] = _mm_add_epi32(x, y);
- in[2] = _mm_add_epi32(in[2], rnding);
- in[2] = _mm_srai_epi32(in[2], bit);
+ x0 = _mm_unpacklo_epi64(v0, v2);
+ x1 = _mm_unpackhi_epi64(v0, v2);
+ x2 = _mm_unpacklo_epi64(v1, v3);
+ x3 = _mm_unpackhi_epi64(v1, v3);
+
+ s0 = _mm_mullo_epi32(x0, sinpi1);
+ s1 = _mm_mullo_epi32(x0, sinpi2);
+ s2 = _mm_mullo_epi32(x1, sinpi3);
+ s3 = _mm_mullo_epi32(x2, sinpi4);
+ s4 = _mm_mullo_epi32(x2, sinpi1);
+ s5 = _mm_mullo_epi32(x3, sinpi2);
+ s6 = _mm_mullo_epi32(x3, sinpi4);
+ t = _mm_sub_epi32(x0, x2);
+ s7 = _mm_add_epi32(t, x3);
+
+ t = _mm_add_epi32(s0, s3);
+ s0 = _mm_add_epi32(t, s5);
+ t = _mm_sub_epi32(s1, s4);
+ s1 = _mm_sub_epi32(t, s6);
+ s3 = s2;
+ s2 = _mm_mullo_epi32(s7, sinpi3);
+
+ u0 = _mm_add_epi32(s0, s3);
+ u1 = _mm_add_epi32(s1, s3);
+ u2 = s2;
+ t = _mm_add_epi32(s0, s1);
+ u3 = _mm_sub_epi32(t, s3);
+
+ u0 = _mm_add_epi32(u0, rnding);
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(u1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(u2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(u3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
+
+ in[0] = u0;
+ in[1] = u1;
+ in[2] = u2;
+ in[3] = u3;
}
static INLINE void round_shift_4x4(__m128i *in, int shift) {
@@ -232,84 +290,65 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[4];
- const TXFM_1D_CFG *row_cfg = NULL;
- const TXFM_1D_CFG *col_cfg = NULL;
+ const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
switch (tx_type) {
case DCT_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_4;
- col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
- idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_4;
- col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
- iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_4;
- col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
- idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_4;
- col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
- iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
-#if CONFIG_EXT_TX
case FLIPADST_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_4;
- col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
- iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+ idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_4;
- col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
- idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_4;
- col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
- iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
break;
case ADST_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_4;
- col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
- iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_4;
- col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
- iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
- iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
- write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
-#endif // CONFIG_EXT_TX
default: assert(0);
}
}
@@ -334,7 +373,8 @@ static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}
-static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
@@ -347,6 +387,9 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u0, u1, u2, u3, u4, u5, u6, u7;
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
__m128i x, y;
@@ -413,16 +456,12 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
v3 = _mm_add_epi32(v3, rnding);
v3 = _mm_srai_epi32(v3, bit);
- v4 = _mm_add_epi32(u4, u5);
- v5 = _mm_sub_epi32(u4, u5);
- v6 = _mm_sub_epi32(u7, u6);
- v7 = _mm_add_epi32(u6, u7);
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
// stage 4
- u0 = _mm_add_epi32(v0, v3);
- u1 = _mm_add_epi32(v1, v2);
- u2 = _mm_sub_epi32(v1, v2);
- u3 = _mm_sub_epi32(v0, v3);
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
u4 = v4;
u7 = v7;
@@ -437,195 +476,334 @@ static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
u5 = _mm_srai_epi32(u5, bit);
// stage 5
- out[0 * 2 + col] = _mm_add_epi32(u0, u7);
- out[1 * 2 + col] = _mm_add_epi32(u1, u6);
- out[2 * 2 + col] = _mm_add_epi32(u2, u5);
- out[3 * 2 + col] = _mm_add_epi32(u3, u4);
- out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
- out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
- out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
- out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
+ addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
+ addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
+ addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
+ } else {
+ addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ }
}
}
-static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
const int32_t *cospi = cospi_arr(bit);
- const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
- const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
- const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
- const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
- const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
- const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
- const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
- const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
- const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
- const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const __m128i zero = _mm_setzero_si128();
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
- __m128i x, y;
- int col;
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
- // Note:
- // Even column: 0, 2, ..., 14
- // Odd column: 1, 3, ..., 15
- // one even column plus one odd column constructs one row (8 coeffs)
- // total we have 8 rows (8x8).
- for (col = 0; col < 2; ++col) {
- // stage 0
- // stage 1
- u0 = in[2 * 0 + col];
- u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
- u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
- u3 = in[2 * 4 + col];
- u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
- u5 = in[2 * 6 + col];
- u6 = in[2 * 2 + col];
- u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
-
- // stage 2
- v0 = u0;
- v1 = u1;
-
- x = _mm_mullo_epi32(u2, cospi32);
- y = _mm_mullo_epi32(u3, cospi32);
- v2 = _mm_add_epi32(x, y);
- v2 = _mm_add_epi32(v2, rnding);
- v2 = _mm_srai_epi32(v2, bit);
-
- v3 = _mm_sub_epi32(x, y);
- v3 = _mm_add_epi32(v3, rnding);
- v3 = _mm_srai_epi32(v3, bit);
-
- v4 = u4;
- v5 = u5;
-
- x = _mm_mullo_epi32(u6, cospi32);
- y = _mm_mullo_epi32(u7, cospi32);
- v6 = _mm_add_epi32(x, y);
- v6 = _mm_add_epi32(v6, rnding);
- v6 = _mm_srai_epi32(v6, bit);
-
- v7 = _mm_sub_epi32(x, y);
- v7 = _mm_add_epi32(v7, rnding);
- v7 = _mm_srai_epi32(v7, bit);
-
- // stage 3
- u0 = _mm_add_epi32(v0, v2);
- u1 = _mm_add_epi32(v1, v3);
- u2 = _mm_sub_epi32(v0, v2);
- u3 = _mm_sub_epi32(v1, v3);
- u4 = _mm_add_epi32(v4, v6);
- u5 = _mm_add_epi32(v5, v7);
- u6 = _mm_sub_epi32(v4, v6);
- u7 = _mm_sub_epi32(v5, v7);
-
- // stage 4
- v0 = u0;
- v1 = u1;
- v2 = u2;
- v3 = u3;
-
- x = _mm_mullo_epi32(u4, cospi16);
- y = _mm_mullo_epi32(u5, cospi48);
- v4 = _mm_add_epi32(x, y);
- v4 = _mm_add_epi32(v4, rnding);
- v4 = _mm_srai_epi32(v4, bit);
-
- x = _mm_mullo_epi32(u4, cospi48);
- y = _mm_mullo_epi32(u5, cospim16);
- v5 = _mm_add_epi32(x, y);
- v5 = _mm_add_epi32(v5, rnding);
- v5 = _mm_srai_epi32(v5, bit);
-
- x = _mm_mullo_epi32(u6, cospim48);
- y = _mm_mullo_epi32(u7, cospi16);
- v6 = _mm_add_epi32(x, y);
- v6 = _mm_add_epi32(v6, rnding);
- v6 = _mm_srai_epi32(v6, bit);
-
- x = _mm_mullo_epi32(u6, cospi16);
- y = _mm_mullo_epi32(u7, cospi48);
- v7 = _mm_add_epi32(x, y);
- v7 = _mm_add_epi32(v7, rnding);
- v7 = _mm_srai_epi32(v7, bit);
-
- // stage 5
- u0 = _mm_add_epi32(v0, v4);
- u1 = _mm_add_epi32(v1, v5);
- u2 = _mm_add_epi32(v2, v6);
- u3 = _mm_add_epi32(v3, v7);
- u4 = _mm_sub_epi32(v0, v4);
- u5 = _mm_sub_epi32(v1, v5);
- u6 = _mm_sub_epi32(v2, v6);
- u7 = _mm_sub_epi32(v3, v7);
-
- // stage 6
- x = _mm_mullo_epi32(u0, cospi4);
- y = _mm_mullo_epi32(u1, cospi60);
- v0 = _mm_add_epi32(x, y);
- v0 = _mm_add_epi32(v0, rnding);
- v0 = _mm_srai_epi32(v0, bit);
+ // Even 8 points: 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[14], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[14], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[10], cospi20);
+ x = _mm_mullo_epi32(in[4], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[10], cospi44);
+ x = _mm_mullo_epi32(in[4], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[6], cospi36);
+ x = _mm_mullo_epi32(in[8], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[6], cospi28);
+ x = _mm_mullo_epi32(in[8], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[2], cospi52);
+ x = _mm_mullo_epi32(in[12], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[2], cospi12);
+ x = _mm_mullo_epi32(in[12], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
- x = _mm_mullo_epi32(u0, cospi60);
- y = _mm_mullo_epi32(u1, cospim4);
- v1 = _mm_add_epi32(x, y);
- v1 = _mm_add_epi32(v1, rnding);
- v1 = _mm_srai_epi32(v1, bit);
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
- x = _mm_mullo_epi32(u2, cospi20);
- y = _mm_mullo_epi32(u3, cospi44);
- v2 = _mm_add_epi32(x, y);
- v2 = _mm_add_epi32(v2, rnding);
- v2 = _mm_srai_epi32(v2, bit);
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[2] = _mm_sub_epi32(kZero, u[4]);
+ out[4] = u[6];
+ out[6] = _mm_sub_epi32(kZero, u[2]);
+ out[8] = u[3];
+ out[10] = _mm_sub_epi32(kZero, u[7]);
+ out[12] = u[5];
+ out[14] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo, &clamp_hi,
+ out_shift);
+ }
- x = _mm_mullo_epi32(u2, cospi44);
- y = _mm_mullo_epi32(u3, cospim20);
- v3 = _mm_add_epi32(x, y);
- v3 = _mm_add_epi32(v3, rnding);
- v3 = _mm_srai_epi32(v3, bit);
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[15], cospi4);
+ x = _mm_mullo_epi32(in[1], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[15], cospi60);
+ x = _mm_mullo_epi32(in[1], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[11], cospi20);
+ x = _mm_mullo_epi32(in[5], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[11], cospi44);
+ x = _mm_mullo_epi32(in[5], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[7], cospi36);
+ x = _mm_mullo_epi32(in[9], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[7], cospi28);
+ x = _mm_mullo_epi32(in[9], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[3], cospi52);
+ x = _mm_mullo_epi32(in[13], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[3], cospi12);
+ x = _mm_mullo_epi32(in[13], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
- x = _mm_mullo_epi32(u4, cospi36);
- y = _mm_mullo_epi32(u5, cospi28);
- v4 = _mm_add_epi32(x, y);
- v4 = _mm_add_epi32(v4, rnding);
- v4 = _mm_srai_epi32(v4, bit);
-
- x = _mm_mullo_epi32(u4, cospi28);
- y = _mm_mullo_epi32(u5, cospim36);
- v5 = _mm_add_epi32(x, y);
- v5 = _mm_add_epi32(v5, rnding);
- v5 = _mm_srai_epi32(v5, bit);
-
- x = _mm_mullo_epi32(u6, cospi52);
- y = _mm_mullo_epi32(u7, cospi12);
- v6 = _mm_add_epi32(x, y);
- v6 = _mm_add_epi32(v6, rnding);
- v6 = _mm_srai_epi32(v6, bit);
-
- x = _mm_mullo_epi32(u6, cospi12);
- y = _mm_mullo_epi32(u7, cospim52);
- v7 = _mm_add_epi32(x, y);
- v7 = _mm_add_epi32(v7, rnding);
- v7 = _mm_srai_epi32(v7, bit);
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
- // stage 7
- out[2 * 0 + col] = v1;
- out[2 * 1 + col] = v6;
- out[2 * 2 + col] = v3;
- out[2 * 3 + col] = v4;
- out[2 * 4 + col] = v5;
- out[2 * 5 + col] = v2;
- out[2 * 6 + col] = v7;
- out[2 * 7 + col] = v0;
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[1] = u[0];
+ out[3] = _mm_sub_epi32(kZero, u[4]);
+ out[5] = u[6];
+ out[7] = _mm_sub_epi32(kZero, u[2]);
+ out[9] = u[3];
+ out[11] = _mm_sub_epi32(kZero, u[7]);
+ out[13] = u[5];
+ out[15] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo, &clamp_hi,
+ out_shift);
}
}
@@ -708,102 +886,92 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[16], out[16];
- const TXFM_1D_CFG *row_cfg = NULL;
- const TXFM_1D_CFG *col_cfg = NULL;
+ const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
switch (tx_type) {
case DCT_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_8;
- col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_8;
- col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_8;
- col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_8;
- col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
-#if CONFIG_EXT_TX
case FLIPADST_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_8;
- col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_8;
- col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+ idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_8;
- col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_8;
- col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_8;
- col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_8x8(in, out);
- iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
-#endif // CONFIG_EXT_TX
default: assert(0);
}
}
@@ -868,7 +1036,8 @@ static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
}
-static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
const int32_t *cospi = cospi_arr(bit);
const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
@@ -894,6 +1063,9 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u[16], v[16], x, y;
int col;
@@ -945,14 +1117,10 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
- u[8] = _mm_add_epi32(v[8], v[9]);
- u[9] = _mm_sub_epi32(v[8], v[9]);
- u[10] = _mm_sub_epi32(v[11], v[10]);
- u[11] = _mm_add_epi32(v[10], v[11]);
- u[12] = _mm_add_epi32(v[12], v[13]);
- u[13] = _mm_sub_epi32(v[12], v[13]);
- u[14] = _mm_sub_epi32(v[15], v[14]);
- u[15] = _mm_add_epi32(v[14], v[15]);
+ addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
// stage 4
x = _mm_mullo_epi32(u[0], cospi32);
@@ -967,10 +1135,8 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
- v[4] = _mm_add_epi32(u[4], u[5]);
- v[5] = _mm_sub_epi32(u[4], u[5]);
- v[6] = _mm_sub_epi32(u[7], u[6]);
- v[7] = _mm_add_epi32(u[6], u[7]);
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
v[8] = u[8];
v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
@@ -981,10 +1147,8 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[15] = u[15];
// stage 5
- u[0] = _mm_add_epi32(v[0], v[3]);
- u[1] = _mm_add_epi32(v[1], v[2]);
- u[2] = _mm_sub_epi32(v[1], v[2]);
- u[3] = _mm_sub_epi32(v[0], v[3]);
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
u[4] = v[4];
x = _mm_mullo_epi32(v[5], cospi32);
@@ -998,24 +1162,16 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
u[6] = _mm_srai_epi32(u[6], bit);
u[7] = v[7];
- u[8] = _mm_add_epi32(v[8], v[11]);
- u[9] = _mm_add_epi32(v[9], v[10]);
- u[10] = _mm_sub_epi32(v[9], v[10]);
- u[11] = _mm_sub_epi32(v[8], v[11]);
- u[12] = _mm_sub_epi32(v[15], v[12]);
- u[13] = _mm_sub_epi32(v[14], v[13]);
- u[14] = _mm_add_epi32(v[13], v[14]);
- u[15] = _mm_add_epi32(v[12], v[15]);
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
// stage 6
- v[0] = _mm_add_epi32(u[0], u[7]);
- v[1] = _mm_add_epi32(u[1], u[6]);
- v[2] = _mm_add_epi32(u[2], u[5]);
- v[3] = _mm_add_epi32(u[3], u[4]);
- v[4] = _mm_sub_epi32(u[3], u[4]);
- v[5] = _mm_sub_epi32(u[2], u[5]);
- v[6] = _mm_sub_epi32(u[1], u[6]);
- v[7] = _mm_sub_epi32(u[0], u[7]);
+ addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
v[8] = u[8];
v[9] = u[9];
@@ -1043,386 +1199,1141 @@ static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
v[15] = u[15];
// stage 7
- out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
- out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
- out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
- out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
- out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
- out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
- out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
- out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
- out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
- out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
- out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
- out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
- out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
- out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
- out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
- out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(v[0], v[15], out + 0 * 4 + col,
+ out + 15 * 4 + col);
+ addsub_no_clamp_sse4_1(v[1], v[14], out + 1 * 4 + col,
+ out + 14 * 4 + col);
+ addsub_no_clamp_sse4_1(v[2], v[13], out + 2 * 4 + col,
+ out + 13 * 4 + col);
+ addsub_no_clamp_sse4_1(v[3], v[12], out + 3 * 4 + col,
+ out + 12 * 4 + col);
+ addsub_no_clamp_sse4_1(v[4], v[11], out + 4 * 4 + col,
+ out + 11 * 4 + col);
+ addsub_no_clamp_sse4_1(v[5], v[10], out + 5 * 4 + col,
+ out + 10 * 4 + col);
+ addsub_no_clamp_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col);
+ addsub_no_clamp_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col);
+ } else {
+ addsub_shift_sse4_1(v[0], v[15], out + 0 * 4 + col, out + 15 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(v[1], v[14], out + 1 * 4 + col, out + 14 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(v[2], v[13], out + 2 * 4 + col, out + 13 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(v[3], v[12], out + 3 * 4 + col, out + 12 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(v[4], v[11], out + 4 * 4 + col, out + 11 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(v[5], v[10], out + 5 * 4 + col, out + 10 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(v[6], v[9], out + 6 * 4 + col, out + 9 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ addsub_shift_sse4_1(v[7], v[8], out + 7 * 4 + col, out + 8 * 4 + col,
+ &clamp_lo, &clamp_hi, out_shift);
+ }
}
}
-static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
const int32_t *cospi = cospi_arr(bit);
- const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
- const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
- const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
- const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
- const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
- const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
- const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
- const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
- const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
- const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
- const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
- const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
- const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
- const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
- const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
- const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
- const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
- const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
- const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
- const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
- const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
- const __m128i zero = _mm_setzero_si128();
-
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
__m128i u[16], v[16], x, y;
+ const int col_num = 4;
int col;
- for (col = 0; col < 4; ++col) {
+ // Calculate the column 0, 1, 2, 3
+ for (col = 0; col < col_num; ++col) {
// stage 0
// stage 1
- u[0] = in[0 * 4 + col];
- u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
- u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
- u[3] = in[8 * 4 + col];
- u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
- u[5] = in[12 * 4 + col];
- u[6] = in[4 * 4 + col];
- u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
- u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
- u[9] = in[14 * 4 + col];
- u[10] = in[6 * 4 + col];
- u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
- u[12] = in[2 * 4 + col];
- u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
- u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
- u[15] = in[10 * 4 + col];
-
// stage 2
- v[0] = u[0];
- v[1] = u[1];
+ v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
- x = _mm_mullo_epi32(u[2], cospi32);
- y = _mm_mullo_epi32(u[3], cospi32);
- v[2] = _mm_add_epi32(x, y);
+ v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
v[2] = _mm_add_epi32(v[2], rnding);
v[2] = _mm_srai_epi32(v[2], bit);
- v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
v[3] = _mm_add_epi32(v[3], rnding);
v[3] = _mm_srai_epi32(v[3], bit);
- v[4] = u[4];
- v[5] = u[5];
-
- x = _mm_mullo_epi32(u[6], cospi32);
- y = _mm_mullo_epi32(u[7], cospi32);
- v[6] = _mm_add_epi32(x, y);
+ v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
v[6] = _mm_add_epi32(v[6], rnding);
v[6] = _mm_srai_epi32(v[6], bit);
- v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
v[7] = _mm_add_epi32(v[7], rnding);
v[7] = _mm_srai_epi32(v[7], bit);
- v[8] = u[8];
- v[9] = u[9];
-
- x = _mm_mullo_epi32(u[10], cospi32);
- y = _mm_mullo_epi32(u[11], cospi32);
- v[10] = _mm_add_epi32(x, y);
+ v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
v[10] = _mm_add_epi32(v[10], rnding);
v[10] = _mm_srai_epi32(v[10], bit);
- v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
v[11] = _mm_add_epi32(v[11], rnding);
v[11] = _mm_srai_epi32(v[11], bit);
- v[12] = u[12];
- v[13] = u[13];
+ v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
- x = _mm_mullo_epi32(u[14], cospi32);
- y = _mm_mullo_epi32(u[15], cospi32);
- v[14] = _mm_add_epi32(x, y);
+ v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
v[14] = _mm_add_epi32(v[14], rnding);
v[14] = _mm_srai_epi32(v[14], bit);
- v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
v[15] = _mm_add_epi32(v[15], rnding);
v[15] = _mm_srai_epi32(v[15], bit);
// stage 3
- u[0] = _mm_add_epi32(v[0], v[2]);
- u[1] = _mm_add_epi32(v[1], v[3]);
- u[2] = _mm_sub_epi32(v[0], v[2]);
- u[3] = _mm_sub_epi32(v[1], v[3]);
- u[4] = _mm_add_epi32(v[4], v[6]);
- u[5] = _mm_add_epi32(v[5], v[7]);
- u[6] = _mm_sub_epi32(v[4], v[6]);
- u[7] = _mm_sub_epi32(v[5], v[7]);
- u[8] = _mm_add_epi32(v[8], v[10]);
- u[9] = _mm_add_epi32(v[9], v[11]);
- u[10] = _mm_sub_epi32(v[8], v[10]);
- u[11] = _mm_sub_epi32(v[9], v[11]);
- u[12] = _mm_add_epi32(v[12], v[14]);
- u[13] = _mm_add_epi32(v[13], v[15]);
- u[14] = _mm_sub_epi32(v[12], v[14]);
- u[15] = _mm_sub_epi32(v[13], v[15]);
+ addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
// stage 4
v[0] = u[0];
v[1] = u[1];
v[2] = u[2];
v[3] = u[3];
- v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
- v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
- v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
- v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
- v[8] = u[8];
- v[9] = u[9];
- v[10] = u[10];
- v[11] = u[11];
- v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
- v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
- v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
- v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
// stage 5
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
+ addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
// stage 6
v[0] = u[0];
v[1] = u[1];
v[2] = u[2];
v[3] = u[3];
- v[4] = u[4];
- v[5] = u[5];
- v[6] = u[6];
- v[7] = u[7];
- v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
- v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
- v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
- v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
- v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
- v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
- v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
- v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
// stage 7
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
+ addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
// stage 8
- v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
- v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
- v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
- v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
- v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
- v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
- v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
- v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
- v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
- v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
- v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
- v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
- v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
- v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
- v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
- v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
// stage 9
- out[0 * 4 + col] = v[1];
- out[1 * 4 + col] = v[14];
- out[2 * 4 + col] = v[3];
- out[3 * 4 + col] = v[12];
- out[4 * 4 + col] = v[5];
- out[5 * 4 + col] = v[10];
- out[6 * 4 + col] = v[7];
- out[7 * 4 + col] = v[8];
- out[8 * 4 + col] = v[9];
- out[9 * 4 + col] = v[6];
- out[10 * 4 + col] = v[11];
- out[11 * 4 + col] = v[4];
- out[12 * 4 + col] = v[13];
- out[13 * 4 + col] = v[2];
- out[14 * 4 + col] = v[15];
- out[15 * 4 + col] = v[0];
+ if (do_cols) {
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2 * col_num + col] = v[12];
+ out[3 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4 * col_num + col] = v[6];
+ out[5 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6 * col_num + col] = v[10];
+ out[7 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8 * col_num + col] = v[3];
+ out[9 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10 * col_num + col] = v[15];
+ out[11 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12 * col_num + col] = v[5];
+ out[13 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14 * col_num + col] = v[9];
+ out[15 * col_num + col] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ } else {
+ neg_shift_sse4_1(v[0], v[8], out + 0 * col_num + col,
+ out + 1 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2 * col_num + col,
+ out + 3 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4 * col_num + col,
+ out + 5 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6 * col_num + col,
+ out + 7 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8 * col_num + col,
+ out + 9 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10 * col_num + col,
+ out + 11 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12 * col_num + col,
+ out + 13 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14 * col_num + col,
+ out + 15 * col_num + col, &clamp_lo, &clamp_hi,
+ out_shift);
+ }
}
}
-static void round_shift_16x16(__m128i *in, int shift) {
- round_shift_8x8(&in[0], shift);
- round_shift_8x8(&in[16], shift);
- round_shift_8x8(&in[32], shift);
- round_shift_8x8(&in[48], shift);
-}
-
void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[64], out[64];
- const TXFM_1D_CFG *row_cfg = NULL;
- const TXFM_1D_CFG *col_cfg = NULL;
+ const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
switch (tx_type) {
case DCT_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_16;
- col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_16;
- col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_16;
- col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_16;
- col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
-#if CONFIG_EXT_TX
case FLIPADST_DCT:
- row_cfg = &inv_txfm_1d_row_cfg_dct_16;
- col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_16;
- col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+ idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_16;
- col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_16;
- col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
- row_cfg = &inv_txfm_1d_row_cfg_adst_16;
- col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
- round_shift_16x16(in, -row_cfg->shift[0]);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
transpose_16x16(in, out);
- iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
- write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
+ iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
break;
-#endif
default: assert(0);
}
}
+
+static void load_buffer_64x64_lower_32x32(const int32_t *coeff, __m128i *in) {
+ int i, j;
+
+ __m128i zero = _mm_setzero_si128();
+
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 8; ++j) {
+ in[16 * i + j] =
+ _mm_loadu_si128((const __m128i *)(coeff + 32 * i + 4 * j));
+ in[16 * i + j + 8] = zero;
+ }
+ }
+
+ for (i = 0; i < 512; ++i) in[512 + i] = zero;
+}
+
+static void transpose_64x64(__m128i *in, __m128i *out, int do_cols) {
+ int i, j;
+ for (i = 0; i < (do_cols ? 16 : 8); ++i) {
+ for (j = 0; j < 8; ++j) {
+ TRANSPOSE_4X4(in[(4 * i + 0) * 16 + j], in[(4 * i + 1) * 16 + j],
+ in[(4 * i + 2) * 16 + j], in[(4 * i + 3) * 16 + j],
+ out[(4 * j + 0) * 16 + i], out[(4 * j + 1) * 16 + i],
+ out[(4 * j + 2) * 16 + i], out[(4 * j + 3) * 16 + i]);
+ }
+ }
+}
+
+static void assign_16x16_input_from_32x32(const __m128i *in, __m128i *in16x16,
+ int col) {
+ int i;
+ for (i = 0; i < 16 * 16 / 4; i += 4) {
+ in16x16[i] = in[col];
+ in16x16[i + 1] = in[col + 1];
+ in16x16[i + 2] = in[col + 2];
+ in16x16[i + 3] = in[col + 3];
+ col += 8;
+ }
+}
+
+static void write_buffer_32x32(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i in16x16[16 * 16 / 4];
+ uint16_t *leftUp = &output[0];
+ uint16_t *rightUp = &output[16];
+ uint16_t *leftDown = &output[16 * stride];
+ uint16_t *rightDown = &output[16 * stride + 16];
+
+ if (fliplr) {
+ swap_addr(&leftUp, &rightUp);
+ swap_addr(&leftDown, &rightDown);
+ }
+
+ if (flipud) {
+ swap_addr(&leftUp, &leftDown);
+ swap_addr(&rightUp, &rightDown);
+ }
+
+ // Left-up quarter
+ assign_16x16_input_from_32x32(in, in16x16, 0);
+ write_buffer_16x16(in16x16, leftUp, stride, fliplr, flipud, shift, bd);
+
+ // Right-up quarter
+ assign_16x16_input_from_32x32(in, in16x16, 32 / 2 / 4);
+ write_buffer_16x16(in16x16, rightUp, stride, fliplr, flipud, shift, bd);
+
+ // Left-down quarter
+ assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4);
+ write_buffer_16x16(in16x16, leftDown, stride, fliplr, flipud, shift, bd);
+
+ // Right-down quarter
+ assign_16x16_input_from_32x32(in, in16x16, 32 * 32 / 2 / 4 + 32 / 2 / 4);
+ write_buffer_16x16(in16x16, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
+static void assign_32x32_input_from_64x64(const __m128i *in, __m128i *in32x32,
+ int col) {
+ int i;
+ for (i = 0; i < 32 * 32 / 4; i += 8) {
+ in32x32[i] = in[col];
+ in32x32[i + 1] = in[col + 1];
+ in32x32[i + 2] = in[col + 2];
+ in32x32[i + 3] = in[col + 3];
+ in32x32[i + 4] = in[col + 4];
+ in32x32[i + 5] = in[col + 5];
+ in32x32[i + 6] = in[col + 6];
+ in32x32[i + 7] = in[col + 7];
+ col += 16;
+ }
+}
+
+static void write_buffer_64x64(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i in32x32[32 * 32 / 4];
+ uint16_t *leftUp = &output[0];
+ uint16_t *rightUp = &output[32];
+ uint16_t *leftDown = &output[32 * stride];
+ uint16_t *rightDown = &output[32 * stride + 32];
+
+ if (fliplr) {
+ swap_addr(&leftUp, &rightUp);
+ swap_addr(&leftDown, &rightDown);
+ }
+
+ if (flipud) {
+ swap_addr(&leftUp, &leftDown);
+ swap_addr(&rightUp, &rightDown);
+ }
+
+ // Left-up quarter
+ assign_32x32_input_from_64x64(in, in32x32, 0);
+ write_buffer_32x32(in32x32, leftUp, stride, fliplr, flipud, shift, bd);
+
+ // Right-up quarter
+ assign_32x32_input_from_64x64(in, in32x32, 64 / 2 / 4);
+ write_buffer_32x32(in32x32, rightUp, stride, fliplr, flipud, shift, bd);
+
+ // Left-down quarter
+ assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4);
+ write_buffer_32x32(in32x32, leftDown, stride, fliplr, flipud, shift, bd);
+
+ // Right-down quarter
+ assign_32x32_input_from_64x64(in, in32x32, 64 * 64 / 2 / 4 + 64 / 2 / 4);
+ write_buffer_32x32(in32x32, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ int col;
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ for (col = 0; col < (do_cols ? 64 / 4 : 32 / 4); ++col) {
+ __m128i u[64], v[64];
+
+ // stage 1
+ u[32] = in[1 * 16 + col];
+ u[34] = in[17 * 16 + col];
+ u[36] = in[9 * 16 + col];
+ u[38] = in[25 * 16 + col];
+ u[40] = in[5 * 16 + col];
+ u[42] = in[21 * 16 + col];
+ u[44] = in[13 * 16 + col];
+ u[46] = in[29 * 16 + col];
+ u[48] = in[3 * 16 + col];
+ u[50] = in[19 * 16 + col];
+ u[52] = in[11 * 16 + col];
+ u[54] = in[27 * 16 + col];
+ u[56] = in[7 * 16 + col];
+ u[58] = in[23 * 16 + col];
+ u[60] = in[15 * 16 + col];
+ u[62] = in[31 * 16 + col];
+
+ v[16] = in[2 * 16 + col];
+ v[18] = in[18 * 16 + col];
+ v[20] = in[10 * 16 + col];
+ v[22] = in[26 * 16 + col];
+ v[24] = in[6 * 16 + col];
+ v[26] = in[22 * 16 + col];
+ v[28] = in[14 * 16 + col];
+ v[30] = in[30 * 16 + col];
+
+ u[8] = in[4 * 16 + col];
+ u[10] = in[20 * 16 + col];
+ u[12] = in[12 * 16 + col];
+ u[14] = in[28 * 16 + col];
+
+ v[4] = in[8 * 16 + col];
+ v[6] = in[24 * 16 + col];
+
+ u[0] = in[0 * 16 + col];
+ u[2] = in[16 * 16 + col];
+
+ // stage 2
+ v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3
+ u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4
+ v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5
+ u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10
+ for (i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11
+ if (do_cols) {
+ for (i = 0; i < 32; i++) {
+ addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
+ &out[16 * (63 - i) + col]);
+ }
+ } else {
+ for (i = 0; i < 32; i++) {
+ addsub_shift_sse4_1(v[i], v[63 - i], &out[16 * (i) + col],
+ &out[16 * (63 - i) + col], &clamp_lo, &clamp_hi,
+ out_shift);
+ }
+ }
+ }
+}
+
+void av1_inv_txfm2d_add_64x64_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64 * 64 / 4], out[64 * 64 / 4];
+ const int8_t *shift = inv_txfm_shift_ls[TX_64X64];
+ const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
+ const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_64x64_lower_32x32(coeff, in);
+ transpose_64x64(in, out, 0);
+ idct64x64_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_64x64(in, out, 1);
+ idct64x64_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_64x64(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+
+ default:
+ av1_inv_txfm2d_add_64x64_c(coeff, output, stride, tx_type, bd);
+ break;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
new file mode 100644
index 000000000..89d0ecb1e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -0,0 +1,853 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const __m256i offset_const_16b = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 16) {
+ const __m256i src_16bit =
+ _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+ if (do_average) {
+ const __m256i data_0 =
+ _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_row_0 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
+ // since not all compilers yet support _mm256_set_m128i()
+ const __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+ const __m256i res = _mm256_sll_epi16(src_10, left_shift);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b, offset_const);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_set1_epi16(0);
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ int i, j;
+ __m256i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
+
+ __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
+ __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ assert(bits >= 0);
+ int i, j;
+ __m256i s[8], coeffs_y[4];
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i round_const_y =
+ _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m256i src6;
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
+ res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
new file mode 100644
index 000000000..ccca6b07a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+
+void av1_highbd_jnt_convolve_y_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ assert(bits >= 0);
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i s[16], coeffs_y[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+ res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+ round_shift_y);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits);
+ res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y),
+ round_shift_y);
+
+ __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const);
+ __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_0, round_result_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_1, round_result_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+
+ } else {
+ __m128i res_16b_0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0);
+
+ __m128i res_16b_1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16b_1);
+ }
+ } else {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_y), round_shift_y);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_y), round_shift_y);
+
+ __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const);
+ __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+ const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero);
+ const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_lo_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
+ &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_lo_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_lo_0, round_result_hi_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_lo_1, round_result_hi_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+ } else {
+ __m128i res_16bit0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0);
+ __m128i res_16bit1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_16bit1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_x_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ int i, j;
+ __m128i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ res_even = _mm_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm_sll_epi32(res_odd, round_shift_bits);
+
+ __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const);
+ if (w - j < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b = _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b);
+ }
+ } else {
+ __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const);
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
index fb246674a..b29bd1d79 100644
--- a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -90,4 +90,14 @@ static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
return x;
}
+static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
+ const __m128i *rounding, int bit) {
+ __m128i x;
+
+ x = _mm_mullo_epi32(*w0, *n0);
+ x = _mm_add_epi32(x, *rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
+
#endif // _HIGHBD_TXFM_UTILITY_SSE4_H
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
new file mode 100644
index 000000000..a08beaafd
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+static const uint8_t warp_highbd_arrange_bytes[16] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+};
+
+static INLINE void horizontal_filter(__m128i src, __m128i src2, __m128i *tmp,
+ int sx, int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ // Calculate filtered results
+ const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
+
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ // Filter odd-index pixels
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
+
+ __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+}
+
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+ const int reduce_bits_horiz =
+ conv_params->round_0 +
+ AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(!(bd == 12 && reduce_bits_horiz < 5));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const __m128i res_sub_const =
+ _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ const __m128i src_01 = _mm_shuffle_epi8(
+ src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+ const __m128i src2_01 = _mm_shuffle_epi8(
+ src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+
+ __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
+ __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
+ }
+
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
+ }
+
+ const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
+ const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
+
+ horizontal_filter(src_padded, src2_padded, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ horizontal_filter(src, src2, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo = _mm_add_epi32(res_lo, res_add_const);
+ res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+
+ if (conv_params->do_average) {
+ __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
+
+ if (conv_params->use_jnt_comp_avg) {
+ res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(res_lo, wt1));
+ res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
+ } else {
+ res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
+ }
+
+ __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
+ res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
+ round_bits_shift);
+
+ __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
+ res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
+ _mm_storel_epi64(dst16, res16_lo);
+ } else {
+ res_lo = _mm_packus_epi32(res_lo, res_lo);
+ _mm_storel_epi64(p, res_lo);
+ }
+ if (p_width > 4) {
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = _mm_add_epi32(res_hi, res_add_const);
+ res_hi =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ if (conv_params->do_average) {
+ __m128i *const dst16_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
+
+ if (conv_params->use_jnt_comp_avg) {
+ res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
+ _mm_mullo_epi32(res_hi, wt1));
+ res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
+ } else {
+ res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
+ }
+
+ __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
+ res32_hi = _mm_sra_epi32(
+ _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
+ __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
+ res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
+ _mm_storel_epi64(dst16_4, res16_hi);
+ } else {
+ res_hi = _mm_packus_epi32(res_hi, res_hi);
+ _mm_storel_epi64(p4, res_hi);
+ }
+ }
+ } else {
+ // Round and pack into 8 bits
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ _mm_storel_epi64(p, res_16bit);
+ } else {
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
deleted file mode 100644
index 71b0ec7a3..000000000
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
+++ /dev/null
@@ -1,365 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-
-void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
- int width, int height, int stride,
- uint16_t *pred, int p_col, int p_row,
- int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y, int bd,
- ConvolveParams *conv_params, int16_t alpha,
- int16_t beta, int16_t gamma, int16_t delta) {
- int comp_avg = conv_params->do_average;
-#if HORSHEAR_REDUCE_PREC_BITS >= 5
- __m128i tmp[15];
-#else
-#error "HORSHEAR_REDUCE_PREC_BITS < 5 not currently supported by SSSE3 filter"
-#endif
- int i, j, k;
-#if CONFIG_CONVOLVE_ROUND
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
- const int reduce_bits_horiz =
- use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz =
- use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
- if (use_conv_params) {
- conv_params->do_post_rounding = 1;
- }
- assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
-
- /* Note: For this code to work, the left/right frame borders need to be
- extended by at least 13 pixels each. By the time we get here, other
- code will have set up this border, but we allow an explicit check
- for debugging purposes.
- */
- /*for (i = 0; i < height; ++i) {
- for (j = 0; j < 13; ++j) {
- assert(ref[i * stride - 13 + j] == ref[i * stride]);
- assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
- }
- }*/
-
- for (i = 0; i < p_height; i += 8) {
- for (j = 0; j < p_width; j += 8) {
- const int32_t src_x = (p_col + j + 4) << subsampling_x;
- const int32_t src_y = (p_row + i + 4) << subsampling_y;
- const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
- const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
- const int32_t x4 = dst_x >> subsampling_x;
- const int32_t y4 = dst_y >> subsampling_y;
-
- int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Add in all the constant terms, including rounding and offset
- sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
- sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
- sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
- // Horizontal filter
- // If the block is aligned such that, after clamping, every sample
- // would be taken from the leftmost/rightmost column, then we can
- // skip the expensive horizontal filter.
- if (ix4 <= -7) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
- ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- }
- } else if (ix4 >= width + 6) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- }
- } else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- const __m128i src2 =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
- // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i round_const = _mm_set1_epi32(
- (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
- // Calculate filtered results
- const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 4), coeff_2);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 8), coeff_4);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 12), coeff_6);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
-
- // Filter odd-index pixels
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 2), coeff_1);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 6), coeff_3);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 10), coeff_5);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(src2, src, 14), coeff_7);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
-
- // Combine results into one register.
- // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
- // as this order helps with the vertical filter.
- tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + delta * (k + 4);
-
- // Load from tmp and rearrange pairs of consecutive rows into the
- // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- const __m128i *src = tmp + (k + 4);
- const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-#if CONFIG_CONVOLVE_ROUND
- if (use_conv_params) {
- __m128i *const p =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j];
- const __m128i round_const = _mm_set1_epi32(
- -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
- ((1 << (conv_params->round_1)) >> 1));
- res_lo = _mm_add_epi32(res_lo, round_const);
- res_lo =
- _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
- if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
- _mm_storeu_si128(p, res_lo);
- if (p_width > 4) {
- res_hi = _mm_add_epi32(res_hi, round_const);
- res_hi =
- _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
- if (comp_avg)
- res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
- _mm_storeu_si128(p + 1, res_hi);
- }
- } else {
-#else
- {
-#endif
- // Round and pack into 8 bits
- const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
- ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
-
- __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- // Clamp res_16bit to the range [0, 2^bd - 1]
- const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
- const __m128i zero = _mm_setzero_si128();
- res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
-
- // Store, blending with 'pred' if needed
- __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
- // Note: If we're outputting a 4x4 block, we need to be very careful
- // to only output 4 pixels at this point, to avoid encode/decode
- // mismatches when encoding with multiple threads.
- if (p_width == 4) {
- if (comp_avg)
- res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
- _mm_storel_epi64(p, res_16bit);
- } else {
- if (comp_avg)
- res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
- _mm_storeu_si128(p, res_16bit);
- }
- }
- }
- }
- }
-}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
new file mode 100644
index 000000000..0c8a8505b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
+// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
+void av1_highbd_wiener_convolve_add_src_avx2(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+ const __m256i clamp_low = zero_256;
+
+ /* Horizontal filter */
+ {
+ const __m256i clamp_high_ep =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *src_ij = src_ptr + i * src_stride + j;
+
+ // Load 16-bit src data
+ const __m256i src_0 = yy_loadu_256(src_ij + 0);
+ const __m256i src_1 = yy_loadu_256(src_ij + 1);
+ const __m256i src_2 = yy_loadu_256(src_ij + 2);
+ const __m256i src_3 = yy_loadu_256(src_ij + 3);
+ const __m256i src_4 = yy_loadu_256(src_ij + 4);
+ const __m256i src_5 = yy_loadu_256(src_ij + 5);
+ const __m256i src_6 = yy_loadu_256(src_ij + 6);
+ const __m256i src_7 = yy_loadu_256(src_ij + 7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ // Calculate scalar product for even- and odd-indices separately,
+ // increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
+
+ // Filter the even-indices, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+ // Filter the odd-indices, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_16bit_clamped = _mm256_min_epi16(
+ _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
+
+ // Store in the dst array
+ yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
new file mode 100644
index 000000000..818b1099c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+void av1_highbd_wiener_convolve_add_src_ssse3(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ int i, j;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero = _mm_setzero_si128();
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+ /* Horizontal filter */
+ {
+ const __m128i coeffs_x =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ conv_params->round_0);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ conv_params->round_0);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ const __m128i maxval =
+ _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
+ _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m128i coeffs_y =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
+
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
deleted file mode 100644
index c440d0f88..000000000
--- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h> // avx2
-
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
-
-#include "aom_dsp/x86/inv_txfm_common_avx2.h"
-
-void av1_idct16_avx2(__m256i *in) {
- const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
- const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
- const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
- const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
- const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
- const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
- const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
- const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
- const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
- const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
- const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
- const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
- const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
- const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
- const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
- const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
- const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
- const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
- const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
- __m256i u0, u1, u2, u3, u4, u5, u6, u7;
- __m256i v0, v1, v2, v3, v4, v5, v6, v7;
- __m256i t0, t1, t2, t3, t4, t5, t6, t7;
-
- // stage 1, (0-7)
- u0 = in[0];
- u1 = in[8];
- u2 = in[4];
- u3 = in[12];
- u4 = in[2];
- u5 = in[10];
- u6 = in[6];
- u7 = in[14];
-
- // stage 2, (0-7)
- // stage 3, (0-7)
- t0 = u0;
- t1 = u1;
- t2 = u2;
- t3 = u3;
- unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
- unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
-
- // stage 4, (0-7)
- unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
- unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
- u4 = _mm256_add_epi16(t4, t5);
- u5 = _mm256_sub_epi16(t4, t5);
- u6 = _mm256_sub_epi16(t7, t6);
- u7 = _mm256_add_epi16(t7, t6);
-
- // stage 5, (0-7)
- t0 = _mm256_add_epi16(u0, u3);
- t1 = _mm256_add_epi16(u1, u2);
- t2 = _mm256_sub_epi16(u1, u2);
- t3 = _mm256_sub_epi16(u0, u3);
- t4 = u4;
- t7 = u7;
- unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
-
- // stage 6, (0-7)
- u0 = _mm256_add_epi16(t0, t7);
- u1 = _mm256_add_epi16(t1, t6);
- u2 = _mm256_add_epi16(t2, t5);
- u3 = _mm256_add_epi16(t3, t4);
- u4 = _mm256_sub_epi16(t3, t4);
- u5 = _mm256_sub_epi16(t2, t5);
- u6 = _mm256_sub_epi16(t1, t6);
- u7 = _mm256_sub_epi16(t0, t7);
-
- // stage 1, (8-15)
- v0 = in[1];
- v1 = in[9];
- v2 = in[5];
- v3 = in[13];
- v4 = in[3];
- v5 = in[11];
- v6 = in[7];
- v7 = in[15];
-
- // stage 2, (8-15)
- unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
- unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
- unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
- unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
-
- // stage 3, (8-15)
- v0 = _mm256_add_epi16(t0, t1);
- v1 = _mm256_sub_epi16(t0, t1);
- v2 = _mm256_sub_epi16(t3, t2);
- v3 = _mm256_add_epi16(t2, t3);
- v4 = _mm256_add_epi16(t4, t5);
- v5 = _mm256_sub_epi16(t4, t5);
- v6 = _mm256_sub_epi16(t7, t6);
- v7 = _mm256_add_epi16(t6, t7);
-
- // stage 4, (8-15)
- t0 = v0;
- t7 = v7;
- t3 = v3;
- t4 = v4;
- unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
- unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
-
- // stage 5, (8-15)
- v0 = _mm256_add_epi16(t0, t3);
- v1 = _mm256_add_epi16(t1, t2);
- v2 = _mm256_sub_epi16(t1, t2);
- v3 = _mm256_sub_epi16(t0, t3);
- v4 = _mm256_sub_epi16(t7, t4);
- v5 = _mm256_sub_epi16(t6, t5);
- v6 = _mm256_add_epi16(t6, t5);
- v7 = _mm256_add_epi16(t7, t4);
-
- // stage 6, (8-15)
- t0 = v0;
- t1 = v1;
- t6 = v6;
- t7 = v7;
- unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
- unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
-
- // stage 7
- in[0] = _mm256_add_epi16(u0, t7);
- in[1] = _mm256_add_epi16(u1, t6);
- in[2] = _mm256_add_epi16(u2, t5);
- in[3] = _mm256_add_epi16(u3, t4);
- in[4] = _mm256_add_epi16(u4, t3);
- in[5] = _mm256_add_epi16(u5, t2);
- in[6] = _mm256_add_epi16(u6, t1);
- in[7] = _mm256_add_epi16(u7, t0);
- in[8] = _mm256_sub_epi16(u7, t0);
- in[9] = _mm256_sub_epi16(u6, t1);
- in[10] = _mm256_sub_epi16(u5, t2);
- in[11] = _mm256_sub_epi16(u4, t3);
- in[12] = _mm256_sub_epi16(u3, t4);
- in[13] = _mm256_sub_epi16(u2, t5);
- in[14] = _mm256_sub_epi16(u1, t6);
- in[15] = _mm256_sub_epi16(u0, t7);
-}
-
-static void idct16(__m256i *in) {
- mm256_transpose_16x16(in, in);
- av1_idct16_avx2(in);
-}
-
-static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
- const __m256i *c0, const __m256i *c1,
- __m256i *b) {
- __m256i x0, x1;
- x0 = _mm256_unpacklo_epi16(*a0, *a1);
- x1 = _mm256_unpackhi_epi16(*a0, *a1);
- b[0] = _mm256_madd_epi16(x0, *c0);
- b[1] = _mm256_madd_epi16(x1, *c0);
- b[2] = _mm256_madd_epi16(x0, *c1);
- b[3] = _mm256_madd_epi16(x1, *c1);
-}
-
-static INLINE void group_rounding(__m256i *a, int num) {
- const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- int i;
- for (i = 0; i < num; ++i) {
- a[i] = _mm256_add_epi32(a[i], dct_rounding);
- a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
- }
-}
-
-static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
- __m256i x[4];
- x[0] = _mm256_add_epi32(a[0], b[0]);
- x[1] = _mm256_add_epi32(a[1], b[1]);
- x[2] = _mm256_add_epi32(a[2], b[2]);
- x[3] = _mm256_add_epi32(a[3], b[3]);
-
- group_rounding(x, 4);
-
- out[0] = _mm256_packs_epi32(x[0], x[1]);
- out[1] = _mm256_packs_epi32(x[2], x[3]);
-}
-
-static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
- __m256i x[4];
- x[0] = _mm256_sub_epi32(a[0], b[0]);
- x[1] = _mm256_sub_epi32(a[1], b[1]);
- x[2] = _mm256_sub_epi32(a[2], b[2]);
- x[3] = _mm256_sub_epi32(a[3], b[3]);
-
- group_rounding(x, 4);
-
- out[0] = _mm256_packs_epi32(x[0], x[1]);
- out[1] = _mm256_packs_epi32(x[2], x[3]);
-}
-
-static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
- group_rounding(a, 4);
- out[0] = _mm256_packs_epi32(a[0], a[1]);
- out[1] = _mm256_packs_epi32(a[2], a[3]);
-}
-
-static void iadst16_avx2(__m256i *in) {
- const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
- const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
- const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
- const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
- const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
- const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
- const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
- const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
- const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
- const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
- const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
- const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
- const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
- const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
- const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
- const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
- const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
- const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
- const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
- const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
- const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
- const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
- const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
- const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
- const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
- const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
- const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
- const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
- const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
- const __m256i zero = _mm256_setzero_si256();
- __m256i x[16], s[16];
- __m256i u[4], v[4];
-
- // stage 1
- butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
- butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
- add_rnd(u, v, &x[0]);
- sub_rnd(u, v, &x[8]);
-
- butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
- butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
- add_rnd(u, v, &x[2]);
- sub_rnd(u, v, &x[10]);
-
- butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
- butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
- add_rnd(u, v, &x[4]);
- sub_rnd(u, v, &x[12]);
-
- butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
- butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
- add_rnd(u, v, &x[6]);
- sub_rnd(u, v, &x[14]);
-
- // stage 2
- s[0] = _mm256_add_epi16(x[0], x[4]);
- s[1] = _mm256_add_epi16(x[1], x[5]);
- s[2] = _mm256_add_epi16(x[2], x[6]);
- s[3] = _mm256_add_epi16(x[3], x[7]);
- s[4] = _mm256_sub_epi16(x[0], x[4]);
- s[5] = _mm256_sub_epi16(x[1], x[5]);
- s[6] = _mm256_sub_epi16(x[2], x[6]);
- s[7] = _mm256_sub_epi16(x[3], x[7]);
- butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
- butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
- add_rnd(u, v, &s[8]);
- sub_rnd(u, v, &s[12]);
-
- butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
- butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
- add_rnd(u, v, &s[10]);
- sub_rnd(u, v, &s[14]);
-
- // stage 3
- x[0] = _mm256_add_epi16(s[0], s[2]);
- x[1] = _mm256_add_epi16(s[1], s[3]);
- x[2] = _mm256_sub_epi16(s[0], s[2]);
- x[3] = _mm256_sub_epi16(s[1], s[3]);
-
- x[8] = _mm256_add_epi16(s[8], s[10]);
- x[9] = _mm256_add_epi16(s[9], s[11]);
- x[10] = _mm256_sub_epi16(s[8], s[10]);
- x[11] = _mm256_sub_epi16(s[9], s[11]);
-
- butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
- butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
- add_rnd(u, v, &x[4]);
- sub_rnd(u, v, &x[6]);
-
- butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
- butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
- add_rnd(u, v, &x[12]);
- sub_rnd(u, v, &x[14]);
-
- // stage 4
- butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
- butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
- butterfly_rnd(u, &x[2]);
- butterfly_rnd(v, &x[6]);
-
- butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
- butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
- butterfly_rnd(u, &x[10]);
- butterfly_rnd(v, &x[14]);
-
- in[0] = x[0];
- in[1] = _mm256_sub_epi16(zero, x[8]);
- in[2] = x[12];
- in[3] = _mm256_sub_epi16(zero, x[4]);
- in[4] = x[6];
- in[5] = x[14];
- in[6] = x[10];
- in[7] = x[2];
- in[8] = x[3];
- in[9] = x[11];
- in[10] = x[15];
- in[11] = x[7];
- in[12] = x[5];
- in[13] = _mm256_sub_epi16(zero, x[13]);
- in[14] = x[9];
- in[15] = _mm256_sub_epi16(zero, x[1]);
-}
-
-static void iadst16(__m256i *in) {
- mm256_transpose_16x16(in, in);
- iadst16_avx2(in);
-}
-
-#if CONFIG_EXT_TX
-static void flip_row(__m256i *in, int rows) {
- int i;
- for (i = 0; i < rows; ++i) {
- mm256_reverse_epi16(&in[i]);
- }
-}
-
-static void flip_col(uint8_t **dest, int *stride, int rows) {
- *dest = *dest + (rows - 1) * (*stride);
- *stride = -*stride;
-}
-
-static void iidtx16(__m256i *in) {
- mm256_transpose_16x16(in, in);
- txfm_scaling16_avx2((int16_t)Sqrt2, in);
-}
-#endif
-
-void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- __m256i in[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- load_buffer_16x16(input, in);
- switch (tx_type) {
- case DCT_DCT:
- idct16(in);
- idct16(in);
- break;
- case ADST_DCT:
- idct16(in);
- iadst16(in);
- break;
- case DCT_ADST:
- iadst16(in);
- idct16(in);
- break;
- case ADST_ADST:
- iadst16(in);
- iadst16(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- idct16(in);
- iadst16(in);
- flip_col(&dest, &stride, 16);
- break;
- case DCT_FLIPADST:
- iadst16(in);
- idct16(in);
- flip_row(in, 16);
- break;
- case FLIPADST_FLIPADST:
- iadst16(in);
- iadst16(in);
- flip_row(in, 16);
- flip_col(&dest, &stride, 16);
- break;
- case ADST_FLIPADST:
- iadst16(in);
- iadst16(in);
- flip_row(in, 16);
- break;
- case FLIPADST_ADST:
- iadst16(in);
- iadst16(in);
- flip_col(&dest, &stride, 16);
- break;
- case IDTX:
- iidtx16(in);
- iidtx16(in);
- break;
- case V_DCT:
- iidtx16(in);
- idct16(in);
- break;
- case H_DCT:
- idct16(in);
- iidtx16(in);
- break;
- case V_ADST:
- iidtx16(in);
- iadst16(in);
- break;
- case H_ADST:
- iadst16(in);
- iidtx16(in);
- break;
- case V_FLIPADST:
- iidtx16(in);
- iadst16(in);
- flip_col(&dest, &stride, 16);
- break;
- case H_FLIPADST:
- iadst16(in);
- iidtx16(in);
- flip_row(in, 16);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
- store_buffer_16xN(in, stride, dest, 16);
-}
diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
deleted file mode 100644
index 541165c8d..000000000
--- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c
+++ /dev/null
@@ -1,1411 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "aom_dsp/x86/inv_txfm_sse2.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-#include "av1/common/enums.h"
-
-#if CONFIG_EXT_TX
-static INLINE void fliplr_4x4(__m128i *in /*in[2]*/) {
- in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
- in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
- in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
- in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
-}
-
-static INLINE void fliplr_8x8(__m128i *in /*in[8]*/) {
- in[0] = mm_reverse_epi16(in[0]);
- in[1] = mm_reverse_epi16(in[1]);
- in[2] = mm_reverse_epi16(in[2]);
- in[3] = mm_reverse_epi16(in[3]);
-
- in[4] = mm_reverse_epi16(in[4]);
- in[5] = mm_reverse_epi16(in[5]);
- in[6] = mm_reverse_epi16(in[6]);
- in[7] = mm_reverse_epi16(in[7]);
-}
-
-static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) {
- fliplr_8x8(&in[0]);
- fliplr_8x8(&in[8]);
-}
-
-#define FLIPLR_16x16(in0, in1) \
- do { \
- __m128i *tmp; \
- fliplr_16x8(in0); \
- fliplr_16x8(in1); \
- tmp = (in0); \
- (in0) = (in1); \
- (in1) = tmp; \
- } while (0)
-
-#define FLIPUD_PTR(dest, stride, size) \
- do { \
- (dest) = (dest) + ((size)-1) * (stride); \
- (stride) = -(stride); \
- } while (0)
-#endif
-
-void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- __m128i in[2];
- const __m128i zero = _mm_setzero_si128();
- const __m128i eight = _mm_set1_epi16(8);
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
-
- switch (tx_type) {
- case DCT_DCT:
- aom_idct4_sse2(in);
- aom_idct4_sse2(in);
- break;
- case ADST_DCT:
- aom_idct4_sse2(in);
- aom_iadst4_sse2(in);
- break;
- case DCT_ADST:
- aom_iadst4_sse2(in);
- aom_idct4_sse2(in);
- break;
- case ADST_ADST:
- aom_iadst4_sse2(in);
- aom_iadst4_sse2(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- aom_idct4_sse2(in);
- aom_iadst4_sse2(in);
- FLIPUD_PTR(dest, stride, 4);
- break;
- case DCT_FLIPADST:
- aom_iadst4_sse2(in);
- aom_idct4_sse2(in);
- fliplr_4x4(in);
- break;
- case FLIPADST_FLIPADST:
- aom_iadst4_sse2(in);
- aom_iadst4_sse2(in);
- FLIPUD_PTR(dest, stride, 4);
- fliplr_4x4(in);
- break;
- case ADST_FLIPADST:
- aom_iadst4_sse2(in);
- aom_iadst4_sse2(in);
- fliplr_4x4(in);
- break;
- case FLIPADST_ADST:
- aom_iadst4_sse2(in);
- aom_iadst4_sse2(in);
- FLIPUD_PTR(dest, stride, 4);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
-
- // Final round and shift
- in[0] = _mm_add_epi16(in[0], eight);
- in[1] = _mm_add_epi16(in[1], eight);
-
- in[0] = _mm_srai_epi16(in[0], 4);
- in[1] = _mm_srai_epi16(in[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
- __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
- d0 = _mm_unpacklo_epi32(d0, d1);
- d2 = _mm_unpacklo_epi32(d2, d3);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, in[0]);
- d2 = _mm_add_epi16(d2, in[1]);
- d0 = _mm_packus_epi16(d0, d2);
- // store result[0]
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store result[1]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store result[2]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- // store result[3]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- }
-}
-
-void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- __m128i in[8];
- const __m128i zero = _mm_setzero_si128();
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- // load input data
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
- in[4] = load_input_data(input + 8 * 4);
- in[5] = load_input_data(input + 8 * 5);
- in[6] = load_input_data(input + 8 * 6);
- in[7] = load_input_data(input + 8 * 7);
-
- switch (tx_type) {
- case DCT_DCT:
- aom_idct8_sse2(in);
- aom_idct8_sse2(in);
- break;
- case ADST_DCT:
- aom_idct8_sse2(in);
- aom_iadst8_sse2(in);
- break;
- case DCT_ADST:
- aom_iadst8_sse2(in);
- aom_idct8_sse2(in);
- break;
- case ADST_ADST:
- aom_iadst8_sse2(in);
- aom_iadst8_sse2(in);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- aom_idct8_sse2(in);
- aom_iadst8_sse2(in);
- FLIPUD_PTR(dest, stride, 8);
- break;
- case DCT_FLIPADST:
- aom_iadst8_sse2(in);
- aom_idct8_sse2(in);
- fliplr_8x8(in);
- break;
- case FLIPADST_FLIPADST:
- aom_iadst8_sse2(in);
- aom_iadst8_sse2(in);
- FLIPUD_PTR(dest, stride, 8);
- fliplr_8x8(in);
- break;
- case ADST_FLIPADST:
- aom_iadst8_sse2(in);
- aom_iadst8_sse2(in);
- fliplr_8x8(in);
- break;
- case FLIPADST_ADST:
- aom_iadst8_sse2(in);
- aom_iadst8_sse2(in);
- FLIPUD_PTR(dest, stride, 8);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
-
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 5);
- in[1] = _mm_srai_epi16(in[1], 5);
- in[2] = _mm_srai_epi16(in[2], 5);
- in[3] = _mm_srai_epi16(in[3], 5);
- in[4] = _mm_srai_epi16(in[4], 5);
- in[5] = _mm_srai_epi16(in[5], 5);
- in[6] = _mm_srai_epi16(in[6], 5);
- in[7] = _mm_srai_epi16(in[7], 5);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
-}
-
-#if CONFIG_EXT_TX
-static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- idtx16_8col(in0);
- idtx16_8col(in1);
-}
-#endif // CONFIG_EXT_TX
-
-void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- __m128i in[32];
- __m128i *in0 = &in[0];
- __m128i *in1 = &in[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- load_buffer_8x16(input, in0);
- input += 8;
- load_buffer_8x16(input, in1);
-
- switch (tx_type) {
- case DCT_DCT:
- aom_idct16_sse2(in0, in1);
- aom_idct16_sse2(in0, in1);
- break;
- case ADST_DCT:
- aom_idct16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- break;
- case DCT_ADST:
- aom_iadst16_sse2(in0, in1);
- aom_idct16_sse2(in0, in1);
- break;
- case ADST_ADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- aom_idct16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- break;
- case DCT_FLIPADST:
- aom_iadst16_sse2(in0, in1);
- aom_idct16_sse2(in0, in1);
- FLIPLR_16x16(in0, in1);
- break;
- case FLIPADST_FLIPADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- FLIPLR_16x16(in0, in1);
- break;
- case ADST_FLIPADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPLR_16x16(in0, in1);
- break;
- case FLIPADST_ADST:
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- break;
- case IDTX:
- iidtx16_sse2(in0, in1);
- iidtx16_sse2(in0, in1);
- break;
- case V_DCT:
- iidtx16_sse2(in0, in1);
- aom_idct16_sse2(in0, in1);
- break;
- case H_DCT:
- aom_idct16_sse2(in0, in1);
- iidtx16_sse2(in0, in1);
- break;
- case V_ADST:
- iidtx16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- break;
- case H_ADST:
- aom_iadst16_sse2(in0, in1);
- iidtx16_sse2(in0, in1);
- break;
- case V_FLIPADST:
- iidtx16_sse2(in0, in1);
- aom_iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- break;
- case H_FLIPADST:
- aom_iadst16_sse2(in0, in1);
- iidtx16_sse2(in0, in1);
- FLIPLR_16x16(in0, in1);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
-
- write_buffer_8x16(dest, in0, stride);
- dest += 8;
- write_buffer_8x16(dest, in1, stride);
-}
-
-#if CONFIG_EXT_TX
-static void iidtx8_sse2(__m128i *in) {
- in[0] = _mm_slli_epi16(in[0], 1);
- in[1] = _mm_slli_epi16(in[1], 1);
- in[2] = _mm_slli_epi16(in[2], 1);
- in[3] = _mm_slli_epi16(in[3], 1);
- in[4] = _mm_slli_epi16(in[4], 1);
- in[5] = _mm_slli_epi16(in[5], 1);
- in[6] = _mm_slli_epi16(in[6], 1);
- in[7] = _mm_slli_epi16(in[7], 1);
-}
-
-static INLINE void iidtx4_sse2(__m128i *in) {
- const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
-
- const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
- const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
- const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
- const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
-
- const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
- const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
- const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
- const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
-
- in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
- xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
- in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
- xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
-}
-
-// load 8x8 array
-static INLINE void flip_buffer_lr_8x8(__m128i *in) {
- in[0] = mm_reverse_epi16(in[0]);
- in[1] = mm_reverse_epi16(in[1]);
- in[2] = mm_reverse_epi16(in[2]);
- in[3] = mm_reverse_epi16(in[3]);
- in[4] = mm_reverse_epi16(in[4]);
- in[5] = mm_reverse_epi16(in[5]);
- in[6] = mm_reverse_epi16(in[6]);
- in[7] = mm_reverse_epi16(in[7]);
-}
-#endif // CONFIG_EXT_TX
-
-void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- __m128i in[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- in[0] = load_input_data(input + 0 * 8);
- in[1] = load_input_data(input + 1 * 8);
- in[2] = load_input_data(input + 2 * 8);
- in[3] = load_input_data(input + 3 * 8);
- in[4] = load_input_data(input + 4 * 8);
- in[5] = load_input_data(input + 5 * 8);
- in[6] = load_input_data(input + 6 * 8);
- in[7] = load_input_data(input + 7 * 8);
-
- in[8] = load_input_data(input + 8 * 8);
- in[9] = load_input_data(input + 9 * 8);
- in[10] = load_input_data(input + 10 * 8);
- in[11] = load_input_data(input + 11 * 8);
- in[12] = load_input_data(input + 12 * 8);
- in[13] = load_input_data(input + 13 * 8);
- in[14] = load_input_data(input + 14 * 8);
- in[15] = load_input_data(input + 15 * 8);
-
- // Row transform
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case H_DCT:
-#endif
- aom_idct8_sse2(in);
- array_transpose_8x8(in, in);
- aom_idct8_sse2(in + 8);
- array_transpose_8x8(in + 8, in + 8);
- break;
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case H_ADST:
- case H_FLIPADST:
-#endif
- aom_iadst8_sse2(in);
- array_transpose_8x8(in, in);
- aom_iadst8_sse2(in + 8);
- array_transpose_8x8(in + 8, in + 8);
- break;
-#if CONFIG_EXT_TX
- case V_FLIPADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
- iidtx8_sse2(in);
- iidtx8_sse2(in + 8);
- break;
-#endif
- default: assert(0); break;
- }
- scale_sqrt2_8x8(in);
- scale_sqrt2_8x8(in + 8);
-
- // Column transform
- switch (tx_type) {
- case DCT_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case V_DCT:
-#endif
- idct16_8col(in);
- break;
- case ADST_DCT:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_ADST:
- case ADST_FLIPADST:
- case FLIPADST_FLIPADST:
- case FLIPADST_DCT:
- case V_ADST:
- case V_FLIPADST:
-#endif
- iadst16_8col(in);
- break;
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
- case IDTX: idtx16_8col(in); break;
-#endif
- default: assert(0); break;
- }
-
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
-#if CONFIG_EXT_TX
- case H_DCT:
-#endif
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case H_ADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
-#endif
- write_buffer_8x16(dest, in, stride);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case FLIPADST_ADST:
- case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
- case DCT_FLIPADST:
- case ADST_FLIPADST:
- case H_FLIPADST:
- flip_buffer_lr_8x8(in);
- flip_buffer_lr_8x8(in + 8);
- write_buffer_8x16(dest, in, stride);
- break;
- case FLIPADST_FLIPADST:
- flip_buffer_lr_8x8(in);
- flip_buffer_lr_8x8(in + 8);
- write_buffer_8x16(dest + stride * 15, in, -stride);
- break;
-#endif
- default: assert(0); break;
- }
-}
-
-static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
- int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
-}
-
-void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- __m128i in[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- // Transpose 16x8 input into in[]
- in[0] = load_input_data(input + 0 * 16);
- in[1] = load_input_data(input + 1 * 16);
- in[2] = load_input_data(input + 2 * 16);
- in[3] = load_input_data(input + 3 * 16);
- in[4] = load_input_data(input + 4 * 16);
- in[5] = load_input_data(input + 5 * 16);
- in[6] = load_input_data(input + 6 * 16);
- in[7] = load_input_data(input + 7 * 16);
- array_transpose_8x8(in, in);
-
- in[8] = load_input_data(input + 8 + 0 * 16);
- in[9] = load_input_data(input + 8 + 1 * 16);
- in[10] = load_input_data(input + 8 + 2 * 16);
- in[11] = load_input_data(input + 8 + 3 * 16);
- in[12] = load_input_data(input + 8 + 4 * 16);
- in[13] = load_input_data(input + 8 + 5 * 16);
- in[14] = load_input_data(input + 8 + 6 * 16);
- in[15] = load_input_data(input + 8 + 7 * 16);
- array_transpose_8x8(in + 8, in + 8);
-
- // Row transform
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case H_DCT:
-#endif
- idct16_8col(in);
- break;
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case H_ADST:
- case H_FLIPADST:
-#endif
- iadst16_8col(in);
- break;
-#if CONFIG_EXT_TX
- case V_FLIPADST:
- case V_ADST:
- case V_DCT:
- case IDTX: idtx16_8col(in); break;
-#endif
- default: assert(0); break;
- }
-
- // Scale
- scale_sqrt2_8x8(in);
- scale_sqrt2_8x8(in + 8);
-
- // Column transform
- switch (tx_type) {
- case DCT_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case V_DCT:
-#endif
- aom_idct8_sse2(in);
- aom_idct8_sse2(in + 8);
- break;
- case ADST_DCT:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_ADST:
- case ADST_FLIPADST:
- case FLIPADST_FLIPADST:
- case FLIPADST_DCT:
- case V_ADST:
- case V_FLIPADST:
-#endif
- aom_iadst8_sse2(in);
- aom_iadst8_sse2(in + 8);
- break;
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
- case IDTX:
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
- iidtx8_sse2(in);
- iidtx8_sse2(in + 8);
- break;
-#endif
- default: assert(0); break;
- }
-
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
-#endif
- write_buffer_8x8_round6(dest, in, stride);
- write_buffer_8x8_round6(dest + 8, in + 8, stride);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case FLIPADST_ADST:
- case V_FLIPADST:
- write_buffer_8x8_round6(dest + stride * 7, in, -stride);
- write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
- break;
- case DCT_FLIPADST:
- case ADST_FLIPADST:
- case H_FLIPADST:
- flip_buffer_lr_8x8(in);
- flip_buffer_lr_8x8(in + 8);
- write_buffer_8x8_round6(dest, in + 8, stride);
- write_buffer_8x8_round6(dest + 8, in, stride);
- break;
- case FLIPADST_FLIPADST:
- flip_buffer_lr_8x8(in);
- flip_buffer_lr_8x8(in + 8);
- write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
- write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
- break;
-#endif
- default: assert(0); break;
- }
-}
-
-static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
- int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 5);
- in[1] = _mm_srai_epi16(in[1], 5);
- in[2] = _mm_srai_epi16(in[2], 5);
- in[3] = _mm_srai_epi16(in[3], 5);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
-}
-
-void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- __m128i in[8];
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- in[0] = load_input_data(input + 0 * 8);
- in[1] = load_input_data(input + 1 * 8);
- in[2] = load_input_data(input + 2 * 8);
- in[3] = load_input_data(input + 3 * 8);
-
- // Row transform
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case H_DCT:
-#endif
- aom_idct8_sse2(in);
- break;
- case DCT_ADST:
- case ADST_ADST: aom_iadst8_sse2(in); break;
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case H_ADST:
- case H_FLIPADST: aom_iadst8_sse2(in); break;
- case V_FLIPADST:
- case V_ADST:
- case V_DCT:
- case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
-#endif
- break;
- default: assert(0); break;
- }
-
- scale_sqrt2_8x8(in);
-
- // Repack data. We pack into the bottom half of 'in'
- // so that the next repacking stage can pack into the
- // top half without overwriting anything
- in[7] = _mm_unpacklo_epi64(in[6], in[7]);
- in[6] = _mm_unpacklo_epi64(in[4], in[5]);
- in[5] = _mm_unpacklo_epi64(in[2], in[3]);
- in[4] = _mm_unpacklo_epi64(in[0], in[1]);
-
- // Column transform
- switch (tx_type) {
- case DCT_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case V_DCT:
-#endif
- aom_idct4_sse2(in + 4);
- aom_idct4_sse2(in + 6);
- break;
- case ADST_DCT:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_ADST:
- case ADST_FLIPADST:
- case FLIPADST_FLIPADST:
- case FLIPADST_DCT:
- case V_ADST:
- case V_FLIPADST:
-#endif
- aom_iadst4_sse2(in + 4);
- aom_iadst4_sse2(in + 6);
- break;
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
- case IDTX:
- iidtx4_sse2(in + 4);
- array_transpose_4x4(in + 4);
- iidtx4_sse2(in + 6);
- array_transpose_4x4(in + 6);
- break;
-#endif
- default: assert(0); break;
- }
-
- // Repack data
- in[0] = _mm_unpacklo_epi64(in[4], in[6]);
- in[1] = _mm_unpackhi_epi64(in[4], in[6]);
- in[2] = _mm_unpacklo_epi64(in[5], in[7]);
- in[3] = _mm_unpackhi_epi64(in[5], in[7]);
-
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case V_ADST:
- case V_DCT:
- case IDTX: break;
- case FLIPADST_DCT:
- case FLIPADST_ADST:
- case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break;
- case DCT_FLIPADST:
- case ADST_FLIPADST:
- case H_FLIPADST:
- in[0] = mm_reverse_epi16(in[0]);
- in[1] = mm_reverse_epi16(in[1]);
- in[2] = mm_reverse_epi16(in[2]);
- in[3] = mm_reverse_epi16(in[3]);
- break;
- case FLIPADST_FLIPADST:
- in[0] = mm_reverse_epi16(in[0]);
- in[1] = mm_reverse_epi16(in[1]);
- in[2] = mm_reverse_epi16(in[2]);
- in[3] = mm_reverse_epi16(in[3]);
- FLIPUD_PTR(dest, stride, 4);
-#endif
- break;
- default: assert(0); break;
- }
- write_buffer_8x4_round5(dest, in, stride);
-}
-
-static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
- int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 5);
- in[1] = _mm_srai_epi16(in[1], 5);
- in[2] = _mm_srai_epi16(in[2], 5);
- in[3] = _mm_srai_epi16(in[3], 5);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
- __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
- __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4));
- __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5));
- __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6));
- __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7));
-
- d0 = _mm_unpacklo_epi32(d0, d1);
- d2 = _mm_unpacklo_epi32(d2, d3);
- d4 = _mm_unpacklo_epi32(d4, d5);
- d6 = _mm_unpacklo_epi32(d6, d7);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d4 = _mm_unpacklo_epi8(d4, zero);
- d6 = _mm_unpacklo_epi8(d6, zero);
- d0 = _mm_add_epi16(d0, in[0]);
- d2 = _mm_add_epi16(d2, in[1]);
- d4 = _mm_add_epi16(d4, in[2]);
- d6 = _mm_add_epi16(d6, in[3]);
-
- d0 = _mm_packus_epi16(d0, d2);
- *(int *)dest = _mm_cvtsi128_si32(d0);
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- d0 = _mm_packus_epi16(d4, d6);
- *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0);
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0);
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0);
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0);
- }
-}
-
-void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- __m128i in[8];
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- // Load rows, packed two per element of 'in'.
- // We pack into the bottom half of 'in' so that the
- // later repacking stage can pack into the
- // top half without overwriting anything
- in[4] = load_input_data(input + 0 * 8);
- in[5] = load_input_data(input + 1 * 8);
- in[6] = load_input_data(input + 2 * 8);
- in[7] = load_input_data(input + 3 * 8);
-
- // Row transform
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case H_DCT:
-#endif
- aom_idct4_sse2(in + 4);
- aom_idct4_sse2(in + 6);
- break;
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case H_ADST:
- case H_FLIPADST:
-#endif
- aom_iadst4_sse2(in + 4);
- aom_iadst4_sse2(in + 6);
- break;
-#if CONFIG_EXT_TX
- case V_FLIPADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
- iidtx4_sse2(in + 4);
- array_transpose_4x4(in + 4);
- iidtx4_sse2(in + 6);
- array_transpose_4x4(in + 6);
- break;
-#endif
- default: assert(0); break;
- }
-
- scale_sqrt2_8x4(in + 4);
-
- // Repack data
- in[0] = _mm_unpacklo_epi64(in[4], in[6]);
- in[1] = _mm_unpackhi_epi64(in[4], in[6]);
- in[2] = _mm_unpacklo_epi64(in[5], in[7]);
- in[3] = _mm_unpackhi_epi64(in[5], in[7]);
-
- // Column transform
- switch (tx_type) {
- case DCT_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case V_DCT:
-#endif
- aom_idct8_sse2(in);
- break;
- case ADST_DCT:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_ADST:
- case ADST_FLIPADST:
- case FLIPADST_FLIPADST:
- case FLIPADST_DCT:
- case V_ADST:
- case V_FLIPADST:
-#endif
- aom_iadst8_sse2(in);
- break;
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
- case IDTX:
- iidtx8_sse2(in);
- array_transpose_8x8(in, in);
- break;
-#endif
- default: assert(0); break;
- }
-
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
-#endif
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case FLIPADST_ADST:
- case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
- case DCT_FLIPADST:
- case ADST_FLIPADST:
- case H_FLIPADST:
- in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
- in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
- in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
- in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
- in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
- in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
- in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
- in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
- break;
- case FLIPADST_FLIPADST:
- in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
- in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
- in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
- in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
- in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
- in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
- in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
- in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
- FLIPUD_PTR(dest, stride, 8);
- break;
-#endif
- default: assert(0); break;
- }
- in[0] = _mm_unpacklo_epi64(in[0], in[1]);
- in[1] = _mm_unpacklo_epi64(in[2], in[3]);
- in[2] = _mm_unpacklo_epi64(in[4], in[5]);
- in[3] = _mm_unpacklo_epi64(in[6], in[7]);
- write_buffer_4x8_round5(dest, in, stride);
-}
-
-// Note: The 16-column 32-element transforms take input in the form of four
-// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
-// of the overall 16x32 input buffer.
-static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
- __m128i *br) {
- array_transpose_16x16(tl, tr);
- array_transpose_16x16(bl, br);
- idct32_8col(tl, bl);
- idct32_8col(tr, br);
-}
-
-static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
- __m128i *br) {
- __m128i tmpl[16], tmpr[16];
- int i;
-
- // Copy the top half of the input to temporary storage
- for (i = 0; i < 16; ++i) {
- tmpl[i] = tl[i];
- tmpr[i] = tr[i];
- }
-
- // Generate the top half of the output
- for (i = 0; i < 16; ++i) {
- tl[i] = _mm_slli_epi16(bl[i], 2);
- tr[i] = _mm_slli_epi16(br[i], 2);
- }
- array_transpose_16x16(tl, tr);
-
- // Copy the temporary storage back to the bottom half of the input
- for (i = 0; i < 16; ++i) {
- bl[i] = tmpl[i];
- br[i] = tmpr[i];
- }
-
- // Generate the bottom half of the output
- scale_sqrt2_8x16(bl);
- scale_sqrt2_8x16(br);
- aom_idct16_sse2(bl, br); // Includes a transposition
-}
-
-#if CONFIG_EXT_TX
-static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
- __m128i *br) {
- int i;
- array_transpose_16x16(tl, tr);
- array_transpose_16x16(bl, br);
- for (i = 0; i < 16; ++i) {
- tl[i] = _mm_slli_epi16(tl[i], 2);
- tr[i] = _mm_slli_epi16(tr[i], 2);
- bl[i] = _mm_slli_epi16(bl[i], 2);
- br[i] = _mm_slli_epi16(br[i], 2);
- }
-}
-#endif // CONFIG_EXT_TX
-
-static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
- __m128i *intr, __m128i *inbl,
- __m128i *inbr, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- int i;
-
- for (i = 0; i < 16; ++i) {
- intl[i] = _mm_adds_epi16(intl[i], final_rounding);
- intr[i] = _mm_adds_epi16(intr[i], final_rounding);
- inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
- inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
- intl[i] = _mm_srai_epi16(intl[i], 6);
- intr[i] = _mm_srai_epi16(intr[i], 6);
- inbl[i] = _mm_srai_epi16(inbl[i], 6);
- inbr[i] = _mm_srai_epi16(inbr[i], 6);
- RECON_AND_STORE(dest + i * stride + 0, intl[i]);
- RECON_AND_STORE(dest + i * stride + 8, intr[i]);
- RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
- RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
- }
-}
-
-void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- __m128i intl[16], intr[16], inbl[16], inbr[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
-
- int i;
- for (i = 0; i < 16; ++i) {
- intl[i] = load_input_data(input + i * 16 + 0);
- intr[i] = load_input_data(input + i * 16 + 8);
- inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
- inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
- }
-
- // Row transform
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case H_DCT:
-#endif
- aom_idct16_sse2(intl, intr);
- aom_idct16_sse2(inbl, inbr);
- break;
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case H_ADST:
- case H_FLIPADST:
-#endif
- aom_iadst16_sse2(intl, intr);
- aom_iadst16_sse2(inbl, inbr);
- break;
-#if CONFIG_EXT_TX
- case V_FLIPADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
- iidtx16_sse2(intl, intr);
- iidtx16_sse2(inbl, inbr);
- break;
-#endif
- default: assert(0); break;
- }
-
- scale_sqrt2_8x16(intl);
- scale_sqrt2_8x16(intr);
- scale_sqrt2_8x16(inbl);
- scale_sqrt2_8x16(inbr);
-
- // Column transform
- switch (tx_type) {
- case DCT_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case V_DCT:
-#endif
- idct32_16col(intl, intr, inbl, inbr);
- break;
- case ADST_DCT:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_ADST:
- case ADST_FLIPADST:
- case FLIPADST_FLIPADST:
- case FLIPADST_DCT:
- case V_ADST:
- case V_FLIPADST:
-#endif
- ihalfright32_16col(intl, intr, inbl, inbr);
- break;
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
- case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
-#endif
- default: assert(0); break;
- }
-
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
-#endif
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case FLIPADST_ADST:
- case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
- case DCT_FLIPADST:
- case ADST_FLIPADST:
- case H_FLIPADST:
- for (i = 0; i < 16; ++i) {
- __m128i tmp = intl[i];
- intl[i] = mm_reverse_epi16(intr[i]);
- intr[i] = mm_reverse_epi16(tmp);
- tmp = inbl[i];
- inbl[i] = mm_reverse_epi16(inbr[i]);
- inbr[i] = mm_reverse_epi16(tmp);
- }
- break;
- case FLIPADST_FLIPADST:
- for (i = 0; i < 16; ++i) {
- __m128i tmp = intl[i];
- intl[i] = mm_reverse_epi16(intr[i]);
- intr[i] = mm_reverse_epi16(tmp);
- tmp = inbl[i];
- inbl[i] = mm_reverse_epi16(inbr[i]);
- inbr[i] = mm_reverse_epi16(tmp);
- }
- FLIPUD_PTR(dest, stride, 32);
- break;
-#endif
- default: assert(0); break;
- }
- write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
-}
-
-static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
- __m128i *in1, __m128i *in2,
- __m128i *in3, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- int i;
-
- for (i = 0; i < 16; ++i) {
- in0[i] = _mm_adds_epi16(in0[i], final_rounding);
- in1[i] = _mm_adds_epi16(in1[i], final_rounding);
- in2[i] = _mm_adds_epi16(in2[i], final_rounding);
- in3[i] = _mm_adds_epi16(in3[i], final_rounding);
- in0[i] = _mm_srai_epi16(in0[i], 6);
- in1[i] = _mm_srai_epi16(in1[i], 6);
- in2[i] = _mm_srai_epi16(in2[i], 6);
- in3[i] = _mm_srai_epi16(in3[i], 6);
- RECON_AND_STORE(dest + i * stride + 0, in0[i]);
- RECON_AND_STORE(dest + i * stride + 8, in1[i]);
- RECON_AND_STORE(dest + i * stride + 16, in2[i]);
- RECON_AND_STORE(dest + i * stride + 24, in3[i]);
- }
-}
-
-void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- __m128i in0[16], in1[16], in2[16], in3[16];
- const TX_TYPE tx_type = txfm_param->tx_type;
- int i;
-
- for (i = 0; i < 16; ++i) {
- in0[i] = load_input_data(input + i * 32 + 0);
- in1[i] = load_input_data(input + i * 32 + 8);
- in2[i] = load_input_data(input + i * 32 + 16);
- in3[i] = load_input_data(input + i * 32 + 24);
- }
-
- // Row transform
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case H_DCT:
-#endif
- idct32_16col(in0, in1, in2, in3);
- break;
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case H_ADST:
- case H_FLIPADST:
-#endif
- ihalfright32_16col(in0, in1, in2, in3);
- break;
-#if CONFIG_EXT_TX
- case V_FLIPADST:
- case V_ADST:
- case V_DCT:
- case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
-#endif
- default: assert(0); break;
- }
-
- scale_sqrt2_8x16(in0);
- scale_sqrt2_8x16(in1);
- scale_sqrt2_8x16(in2);
- scale_sqrt2_8x16(in3);
-
- // Column transform
- switch (tx_type) {
- case DCT_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case DCT_FLIPADST:
- case V_DCT:
-#endif
- aom_idct16_sse2(in0, in1);
- aom_idct16_sse2(in2, in3);
- break;
- case ADST_DCT:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_ADST:
- case ADST_FLIPADST:
- case FLIPADST_FLIPADST:
- case FLIPADST_DCT:
- case V_ADST:
- case V_FLIPADST:
-#endif
- aom_iadst16_sse2(in0, in1);
- aom_iadst16_sse2(in2, in3);
- break;
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case H_FLIPADST:
- case IDTX:
- iidtx16_sse2(in0, in1);
- iidtx16_sse2(in2, in3);
- break;
-#endif
- default: assert(0); break;
- }
-
- switch (tx_type) {
- case DCT_DCT:
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
-#if CONFIG_EXT_TX
- case H_DCT:
- case H_ADST:
- case V_ADST:
- case V_DCT:
- case IDTX:
-#endif
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case FLIPADST_ADST:
- case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
- case DCT_FLIPADST:
- case ADST_FLIPADST:
- case H_FLIPADST:
- for (i = 0; i < 16; ++i) {
- __m128i tmp1 = in0[i];
- __m128i tmp2 = in1[i];
- in0[i] = mm_reverse_epi16(in3[i]);
- in1[i] = mm_reverse_epi16(in2[i]);
- in2[i] = mm_reverse_epi16(tmp2);
- in3[i] = mm_reverse_epi16(tmp1);
- }
- break;
- case FLIPADST_FLIPADST:
- for (i = 0; i < 16; ++i) {
- __m128i tmp1 = in0[i];
- __m128i tmp2 = in1[i];
- in0[i] = mm_reverse_epi16(in3[i]);
- in1[i] = mm_reverse_epi16(in2[i]);
- in2[i] = mm_reverse_epi16(tmp2);
- in3[i] = mm_reverse_epi16(tmp1);
- }
- FLIPUD_PTR(dest, stride, 16);
- break;
-#endif
- default: assert(0); break;
- }
- write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
-}
diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c
index ea4acff33..0c857b583 100644
--- a/third_party/aom/av1/common/x86/intra_edge_sse4.c
+++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c
@@ -12,8 +12,8 @@
#include <assert.h>
#include <smmintrin.h>
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
if (!strength) return;
@@ -39,9 +39,9 @@ void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
// Adjust input pointer for filter support area
uint8_t *in = (strength == 3) ? p - 1 : p;
- // Avoid modifying first/last samples
+ // Avoid modifying first sample
uint8_t *out = p + 1;
- int len = sz - 2;
+ int len = sz - 1;
const int use_3tap_filter = (strength < 3);
@@ -133,9 +133,9 @@ void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
// Adjust input pointer for filter support area
uint16_t *in = (strength == 3) ? p - 1 : p;
- // Avoid modifying first/last samples
+ // Avoid modifying first sample
uint16_t *out = p + 1;
- int len = sz - 2;
+ int len = sz - 1;
const int use_3tap_filter = (strength < 3);
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
new file mode 100644
index 000000000..ac1d2c9ca
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16(w0);
+ const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ __m256i filt[4], coeffs[4];
+
+ assert(bits >= 0);
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+ const __m256i round_const =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + j + src_stride]))),
+ 0x20);
+
+ __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+ 0x20);
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ // +1 to compensate for dividing the filter coeffs by 2
+ const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16(w0);
+ const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int offset_1 = (1 << (bd + FILTER_BITS - 2));
+ const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);
+ const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i coeffs[4], s[8];
+
+ assert((FILTER_BITS - conv_params->round_0) >= 0);
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+ (void)conv_params;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
+
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+ 0x20);
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+ 0x20);
+
+ const __m256i data_ref_0_hi = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j + 8]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]))),
+ 0x20);
+
+ const __m256i comp_avg_res_lo =
+ comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i comp_avg_res_hi =
+ comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+
+ const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+}
+
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16(w0);
+ const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+ filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+ filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ const __m256i round_const_h = _mm256_set1_epi16(
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data,
+ _mm_loadu_si128(
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+ 1);
+ __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+ 0x20);
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+ 0x20);
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16(w0);
+ const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ int i, j;
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 16) {
+ const __m256i src_16bit = _mm256_cvtepu8_epi16(
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+ _mm256_castsi256_si128(res_0));
+ } else {
+ _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+ res_unsigned);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+ // since not all compilers yet support _mm256_set_m128i()
+ const __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+ const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]))),
+ 0x20);
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
new file mode 100644
index 000000000..4df7bd42e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ __m128i coeffs[4];
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+
+ if (w == 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
+ }
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ // Filter even-index pixels
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+ const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+}
+
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
+ const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ __m128i coeffs[4];
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+
+ if (w == 4) {
+ __m128i s[8], src6, res, res_shift;
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+
+ res = convolve_lo_y(s + 0, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ res = convolve_lo_y(s + 1, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ res_16b = _mm_packs_epi32(res_shift, res_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+ res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
new file mode 100644
index 000000000..e4d51ac8d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.c b/third_party/aom/av1/common/x86/pvq_sse4.c
deleted file mode 100644
index b3ed9efdf..000000000
--- a/third_party/aom/av1/common/x86/pvq_sse4.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>
-#include <emmintrin.h>
-#include <tmmintrin.h>
-#include <float.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/pvq_sse4.h"
-#include "../odintrin.h"
-#include "av1/common/pvq.h"
-
-#define EPSILON 1e-15f
-
-static __m128 horizontal_sum_ps(__m128 x) {
- x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
- x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
- return x;
-}
-
-static __m128i horizontal_sum_epi32(__m128i x) {
- x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
- x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
- return x;
-}
-
-static INLINE float rsqrtf(float x) {
- float y;
- _mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
- return y;
-}
-
-/** Find the codepoint on the given PSphere closest to the desired
- * vector. This is a float-precision PVQ search just to make sure
- * our tests aren't limited by numerical accuracy. It's close to the
- * pvq_search_rdo_double_c implementation, but is not bit accurate and
- * it performs slightly worse on PSNR. One reason is that this code runs
- * more RDO iterations than the C code. It also uses single precision
- * floating point math, whereas the C version uses double precision.
- *
- * @param [in] xcoeff input vector to quantize (x in the math doc)
- * @param [in] n number of dimensions
- * @param [in] k number of pulses
- * @param [out] ypulse optimal codevector found (y in the math doc)
- * @param [in] g2 multiplier for the distortion (typically squared
- * gain units)
- * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
- * @param [in] prev_k number of pulses already in ypulse that we should
- * reuse for the search (or 0 for a new search)
- * @return cosine distance between x and y (between 0 and 1)
- */
-double pvq_search_rdo_double_sse4_1(const od_val16 *xcoeff, int n, int k,
- int *ypulse, double g2,
- double pvq_norm_lambda, int prev_k) {
- int i, j;
- int reuse_pulses = prev_k > 0 && prev_k <= k;
- /* TODO - This blows our 8kB stack space budget and should be fixed when
- converting PVQ to fixed point. */
- float xx = 0, xy = 0, yy = 0;
- float x[MAXN + 3];
- float y[MAXN + 3];
- float sign_y[MAXN + 3];
- for (i = 0; i < n; i++) {
- float tmp = (float)xcoeff[i];
- xx += tmp * tmp;
- x[i] = xcoeff[i];
- }
-
- x[n] = x[n + 1] = x[n + 2] = 0;
- ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;
-
- __m128 sums = _mm_setzero_ps();
- for (i = 0; i < n; i += 4) {
- __m128 x4 = _mm_loadu_ps(&x[i]);
- __m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
- /* Save the sign, we'll put it back later. */
- _mm_storeu_ps(&sign_y[i], s4);
- /* Get rid of the sign. */
- x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
- sums = _mm_add_ps(sums, x4);
- if (!reuse_pulses) {
- /* Clear y and ypulse in case we don't do the projection. */
- _mm_storeu_ps(&y[i], _mm_setzero_ps());
- _mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
- }
- _mm_storeu_ps(&x[i], x4);
- }
- sums = horizontal_sum_ps(sums);
- int pulses_left = k;
- {
- __m128i pulses_sum;
- __m128 yy4, xy4;
- xy4 = yy4 = _mm_setzero_ps();
- pulses_sum = _mm_setzero_si128();
- if (reuse_pulses) {
- /* We reuse pulses from a previous search so we don't have to search them
- again. */
- for (j = 0; j < n; j += 4) {
- __m128 x4, y4;
- __m128i iy4;
- iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
- pulses_sum = _mm_add_epi32(pulses_sum, iy4);
- _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
- y4 = _mm_cvtepi32_ps(iy4);
- x4 = _mm_loadu_ps(&x[j]);
- xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
- yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
- /* Double the y[] vector so we don't have to do it in the search loop.
- */
- _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
- }
- pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
- xy4 = horizontal_sum_ps(xy4);
- xy = _mm_cvtss_f32(xy4);
- yy4 = horizontal_sum_ps(yy4);
- yy = _mm_cvtss_f32(yy4);
- } else if (k > (n >> 1)) {
- /* Do a pre-search by projecting on the pyramid. */
- __m128 rcp4;
- float sum = _mm_cvtss_f32(sums);
- /* If x is too small, just replace it with a pulse at 0. This prevents
- infinities and NaNs from causing too many pulses to be allocated. Here,
- 64 is an
- approximation of infinity. */
- if (sum <= EPSILON) {
- x[0] = 1.f;
- for (i = 1; i < n; i++) {
- x[i] = 0;
- }
- sums = _mm_set_ps1(1.f);
- }
- /* Using k + e with e < 1 guarantees we cannot get more than k pulses. */
- rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums));
- xy4 = yy4 = _mm_setzero_ps();
- pulses_sum = _mm_setzero_si128();
- for (j = 0; j < n; j += 4) {
- __m128 rx4, x4, y4;
- __m128i iy4;
- x4 = _mm_loadu_ps(&x[j]);
- rx4 = _mm_mul_ps(x4, rcp4);
- iy4 = _mm_cvttps_epi32(rx4);
- pulses_sum = _mm_add_epi32(pulses_sum, iy4);
- _mm_storeu_si128((__m128i *)&ypulse[j], iy4);
- y4 = _mm_cvtepi32_ps(iy4);
- xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
- yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
- /* Double the y[] vector so we don't have to do it in the search loop.
- */
- _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
- }
- pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
- xy = _mm_cvtss_f32(horizontal_sum_ps(xy4));
- yy = _mm_cvtss_f32(horizontal_sum_ps(yy4));
- }
- x[n] = x[n + 1] = x[n + 2] = -100;
- y[n] = y[n + 1] = y[n + 2] = 100;
- }
-
- /* This should never happen. */
- OD_ASSERT(pulses_left <= n + 3);
-
- float lambda_delta_rate[MAXN + 3];
- if (pulses_left) {
- /* Hoist lambda to avoid the multiply in the loop. */
- float lambda =
- 0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2);
- float delta_rate = 3.f / n;
- __m128 count = _mm_set_ps(3, 2, 1, 0);
- for (i = 0; i < n; i += 4) {
- _mm_storeu_ps(&lambda_delta_rate[i],
- _mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate)));
- count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4));
- }
- }
- lambda_delta_rate[n] = lambda_delta_rate[n + 1] = lambda_delta_rate[n + 2] =
- 1e30f;
-
- for (i = 0; i < pulses_left; i++) {
- int best_id = 0;
- __m128 xy4, yy4;
- __m128 max, max2;
- __m128i count;
- __m128i pos;
-
- /* The squared magnitude term gets added anyway, so we might as well
- add it outside the loop. */
- yy = yy + 1;
- xy4 = _mm_load1_ps(&xy);
- yy4 = _mm_load1_ps(&yy);
- max = _mm_setzero_ps();
- pos = _mm_setzero_si128();
- count = _mm_set_epi32(3, 2, 1, 0);
- for (j = 0; j < n; j += 4) {
- __m128 x4, y4, r4;
- x4 = _mm_loadu_ps(&x[j]);
- y4 = _mm_loadu_ps(&y[j]);
- x4 = _mm_add_ps(x4, xy4);
- y4 = _mm_add_ps(y4, yy4);
- y4 = _mm_rsqrt_ps(y4);
- r4 = _mm_mul_ps(x4, y4);
- /* Subtract lambda. */
- r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j]));
- /* Update the index of the max. */
- pos = _mm_max_epi16(
- pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
- /* Update the max. */
- max = _mm_max_ps(max, r4);
- /* Update the indices (+4) */
- count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4));
- }
- /* Horizontal max. */
- max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
- max2 =
- _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
- /* Now that max2 contains the max at all positions, look at which value(s)
- of the
- partial max is equal to the global max. */
- pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
- pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
- pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
- best_id = _mm_cvtsi128_si32(pos);
- OD_ASSERT(best_id < n);
- /* Updating the sums of the new pulse(s) */
- xy = xy + x[best_id];
- /* We're multiplying y[j] by two so we don't have to do it here. */
- yy = yy + y[best_id];
- /* Only now that we've made the final choice, update y/ypulse. */
- /* Multiplying y[j] by 2 so we don't have to do it everywhere else. */
- y[best_id] += 2;
- ypulse[best_id]++;
- }
-
- /* Put the original sign back. */
- for (i = 0; i < n; i += 4) {
- __m128i y4;
- __m128i s4;
- y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
- s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
- y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
- _mm_storeu_si128((__m128i *)&ypulse[i], y4);
- }
- return xy * rsqrtf(xx * yy + FLT_MIN);
-}
diff --git a/third_party/aom/av1/common/x86/pvq_sse4.h b/third_party/aom/av1/common/x86/pvq_sse4.h
deleted file mode 100644
index 3c4ce8543..000000000
--- a/third_party/aom/av1/common/x86/pvq_sse4.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_COMMON_PVQ_X86_SSE4_H_
-#define AOM_COMMON_PVQ_X86_SSE4_H_
-#endif // AOM_COMMON_PVQ_X86_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
new file mode 100644
index 000000000..ffbb31849
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+
+void av1_build_compound_diffwtd_mask_highbd_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 16) {
+ av1_build_compound_diffwtd_mask_highbd_ssse3(
+ mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd);
+ } else {
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ assert(bd >= 8);
+ assert((w % 16) == 0);
+ const __m256i y0 = _mm256_setzero_si256();
+ const __m256i yAOM_BLEND_A64_MAX_ALPHA =
+ _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m256i ymask_base = _mm256_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c
new file mode 100644
index 000000000..5171ca493
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_sse4.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "av1/common/blockd.h"
+
+static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,
+ const __m128i s1) {
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
+ return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4)));
+ // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54)
+}
+
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m128i mask_base = _mm_set1_epi16(38 - mb);
+ int i = 0;
+ if (4 == w) {
+ do {
+ const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);
+ const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
+
+ const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
+ const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
+
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+
+ *(uint32_t *)mask = _mm_cvtsi128_si32(m8);
+ *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += 8;
+ i += 2;
+ } while (i < h);
+ } else if (8 == w) {
+ do {
+ __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
+ __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
+ s0 = _mm_cvtepu8_epi16(s0);
+ s1 = _mm_cvtepu8_epi16(s1);
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+ _mm_storel_epi64((__m128i *)mask, m8);
+ src0 += stride0;
+ src1 += stride1;
+ mask += 8;
+ i += 1;
+ } while (i < h);
+ } else {
+ const __m128i zero = _mm_setzero_si128();
+ do {
+ int j = 0;
+ do {
+ const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));
+ const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));
+ const __m128i s0L = _mm_cvtepu8_epi16(s0);
+ const __m128i s1L = _mm_cvtepu8_epi16(s1);
+ const __m128i s0H = _mm_unpackhi_epi8(s0, zero);
+ const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
+
+ const __m128i m16L = calc_mask(mask_base, s0L, s1L);
+ const __m128i m16H = calc_mask(mask_base, s0H, s1H);
+
+ const __m128i m8 = _mm_packus_epi16(m16L, m16H);
+ _mm_store_si128((__m128i *)(mask + j), m8);
+ j += 16;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_sse4_1(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1;
+ const int mask_base = 38;
+ int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ const __m128i round_const = _mm_set1_epi16((1 << round) >> 1);
+ const __m128i mask_base_16 = _mm_set1_epi16(mask_base);
+ const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i add_const =
+ _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0));
+ const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1));
+
+ int i, j;
+ // When rounding constant is added, there is a possibility of overflow.
+ // However that much precision is not required. Code should very well work for
+ // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But
+ // there is a possibility of corner case bugs.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data_src0 =
+ _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]);
+ const __m128i data_src1 =
+ _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]);
+
+ const __m128i diffa = _mm_subs_epu16(data_src0, data_src1);
+ const __m128i diffb = _mm_subs_epu16(data_src1, data_src0);
+ const __m128i diff = _mm_max_epu16(diffa, diffb);
+ const __m128i diff_round =
+ _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round);
+ const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16);
+ __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff);
+ // clamp to 0 can be skipped since we are using add and saturate
+ // instruction
+
+ const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign);
+ const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const);
+
+ // 8 bit conversion and saturation to uint8
+ const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16);
+
+ // Store values into the destination buffer
+ __m128i *const dst = (__m128i *)&mask[i * w + j];
+
+ if ((w - j) > 4) {
+ _mm_storel_epi64(dst, res_8);
+ } else { // w==4
+ *(uint32_t *)dst = _mm_cvtsi128_si32(res_8);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c
new file mode 100644
index 000000000..cf684447c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+
+void av1_build_compound_diffwtd_mask_highbd_ssse3(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 8) {
+ av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride,
+ src1, src1_stride, h, w, bd);
+ } else {
+ assert(bd >= 8);
+ assert((w % 8) == 0);
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ const __m128i x0 = _mm_setzero_si128();
+ const __m128i xAOM_BLEND_A64_MAX_ALPHA =
+ _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m128i xmask_base = _mm_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
new file mode 100644
index 000000000..375def62e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_8_32(const void *p) {
+ return _mm256_cvtepu8_epi32(xx_loadl_64(p));
+}
+
+// Load 8 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_16_32(const void *p) {
+ return _mm256_cvtepu16_epi32(xx_loadu_128(p));
+}
+
+// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
+// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
+// x0+x1+...+x7
+//
+// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
+// (assumed small enough to be able to add them without overflow).
+//
+// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
+//
+// x = [h g f e][d c b a]
+// x01 = [g f e 0][c b a 0]
+// x02 = [g+h f+g e+f e][c+d b+c a+b a]
+// x03 = [e+f e 0 0][a+b a 0 0]
+// x04 = [e->h e->g e->f e][a->d a->c a->b a]
+// s = a->d
+// s01 = [a->d a->d a->d a->d]
+// s02 = [a->d a->d a->d a->d][0 0 0 0]
+// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
+static __m256i scan_32(__m256i x) {
+ const __m256i x01 = _mm256_slli_si256(x, 4);
+ const __m256i x02 = _mm256_add_epi32(x, x01);
+ const __m256i x03 = _mm256_slli_si256(x02, 8);
+ const __m256i x04 = _mm256_add_epi32(x02, x03);
+ const int32_t s = _mm256_extract_epi32(x04, 3);
+ const __m128i s01 = _mm_set1_epi32(s);
+ const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
+ return _mm256_add_epi32(x04, s02);
+}
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
+// of 8.
+
+static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
+ unsigned int i = 0;
+ for (i = 0; i < (count & 0xffffffe0); i += 32) {
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
+ }
+ for (; i < (count & 0xfffffff8); i += 8) {
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ }
+ for (; i < count; i++) {
+ dest[i] = 0;
+ }
+ return dest;
+}
+
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute 8 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
+ const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
+ const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
+ const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
+ const __m256i u = _mm256_sub_epi32(tr, tl);
+ const __m256i v = _mm256_sub_epi32(br, bl);
+ return _mm256_sub_epi32(v, u);
+}
+
+static __m256i round_for_shift(unsigned shift) {
+ return _mm256_set1_epi32((1 << shift) >> 1);
+}
+
+static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
+ __m256i an, bb;
+ if (bit_depth > 8) {
+ const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
+ const __m256i rounding_b = round_for_shift(bit_depth - 8);
+ const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+ const __m256i a =
+ _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
+ const __m256i b =
+ _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
+ // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+ // mullo to square it
+ bb = _mm256_madd_epi16(b, b);
+ an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
+ } else {
+ bb = _mm256_madd_epi16(sum1, sum1);
+ an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
+ }
+ return _mm256_sub_epi32(an, bb);
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed as follows:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = (fours + threes) << 2 - threes
+static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xl = yy_loadu_256(buf - 1);
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fours = _mm256_add_epi32(
+ xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
+ const __m256i threes =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+
+ return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
+ threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+// = 5 * (fives + sixes) - sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fives =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+ const __m256i sixes = _mm256_add_epi32(xt, xb);
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
+ const __m256i xl = yy_loadu_256(buf - 1);
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+
+ const __m256i fives = _mm256_add_epi32(xl, xr);
+ const __m256i sixes = x;
+
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
+
+ const __m256i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m256i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m256i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+}
+
+void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
+ // Ctl and Dtl is 32-byte aligned.
+ const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
+
+ DECLARE_ALIGNED(32, int32_t,
+ buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);
+
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 32 bytes for efficiency.
+ int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
+
+ // The "tl" pointers point at the top-left of the initialised data for the
+ // array.
+ int32_t *Atl = buf + 0 * buf_elts + 7;
+ int32_t *Btl = buf + 1 * buf_elts + 7;
+ int32_t *Ctl = buf + 2 * buf_elts + 7;
+ int32_t *Dtl = buf + 3 * buf_elts + 7;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column in A, B (integral images), so we move down
+ // and right one for them.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+ }
+
+ if (params->r[1] > 0) {
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+ }
+}
+
+void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
+ width, eps, bit_depth, highbd);
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+ decode_xq(xqd, xq, params);
+
+ __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ __m256i xq1 = _mm256_set1_epi32(xq[1]);
+
+ for (int i = 0; i < height; ++i) {
+ // Calculate output in batches of 16 pixels
+ for (int j = 0; j < width; j += 16) {
+ const int k = i * width + j;
+ const int m = i * dst_stride + j;
+
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m256i ep_0, ep_1;
+ __m128i src_0, src_1;
+ if (highbd) {
+ src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
+ ep_0 = _mm256_cvtepu16_epi32(src_0);
+ ep_1 = _mm256_cvtepu16_epi32(src_1);
+ } else {
+ src_0 = xx_loadu_128(dat8ij);
+ ep_0 = _mm256_cvtepu8_epi32(src_0);
+ ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
+ }
+
+ const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
+ const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
+
+ __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
+
+ const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
+ }
+
+ if (params->r[1] > 0) {
+ const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
+
+ const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
+ }
+
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_0 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_1 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ // Note that packing into 16 bits messes up the order of the bits,
+ // so we use a permute function to correct this
+ const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
+ const __m256i res = _mm256_min_epi16(tmp2, max);
+ yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+ // Note that each pack messes up the order of the bits,
+ // so we use a permute function to correct this
+ const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i res =
+ _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
+ const __m128i res2 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
+ xx_storeu_128(dst8 + m, res2);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index 9de9177c1..a42c94028 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -1,1821 +1,643 @@
#include <smmintrin.h>
-#include "./aom_config.h"
-#include "./av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
#include "av1/common/restoration.h"
#include "aom_dsp/x86/synonyms.h"
-/* Calculate four consecutive entries of the intermediate A and B arrays
- (corresponding to the first loop in the C version of
- av1_selfguided_restoration)
-*/
-static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
- __m128i *one_over_n_, __m128i *s_, int bit_depth,
- int idx, int32_t *A, int32_t *B) {
- __m128i a, b, p;
- __m128i one_over_n = *one_over_n_;
- __m128i s = *s_;
-#if CONFIG_HIGHBITDEPTH
- if (bit_depth > 8) {
- __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
- __m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
- __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
- __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
- a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
- b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
- a = _mm_mullo_epi32(a, n);
- b = _mm_mullo_epi32(b, b);
- p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
- } else {
-#endif
- (void)bit_depth;
- a = _mm_mullo_epi32(sum_sq, n);
- b = _mm_mullo_epi32(sum, sum);
- p = _mm_sub_epi32(a, b);
-#if CONFIG_HIGHBITDEPTH
- }
-#endif
-
- __m128i rounding_z = _mm_set1_epi32((1 << SGRPROJ_MTABLE_BITS) >> 1);
- __m128i z = _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rounding_z),
- SGRPROJ_MTABLE_BITS);
- z = _mm_min_epi32(z, _mm_set1_epi32(255));
-
- // 'Gather' type instructions are not available pre-AVX2, so synthesize a
- // gather using scalar loads.
- __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
- x_by_xplus1[_mm_extract_epi32(z, 2)],
- x_by_xplus1[_mm_extract_epi32(z, 1)],
- x_by_xplus1[_mm_extract_epi32(z, 0)]);
-
- _mm_storeu_si128((__m128i *)&A[idx], a_res);
-
- __m128i rounding_res = _mm_set1_epi32((1 << SGRPROJ_RECIP_BITS) >> 1);
- __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
- __m128i b_int =
- _mm_mullo_epi32(a_complement, _mm_mullo_epi32(sum, one_over_n));
- __m128i b_res =
- _mm_srli_epi32(_mm_add_epi32(b_int, rounding_res), SGRPROJ_RECIP_BITS);
-
- _mm_storeu_si128((__m128i *)&B[idx], b_res);
+// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_8_32(const void *p) {
+ return _mm_cvtepu8_epi32(xx_loadl_32(p));
}
-static void selfguided_restoration_1_v(uint8_t *src, int width, int height,
- int src_stride, int32_t *A, int32_t *B,
- int buf_stride) {
- int i, j;
-
- // Vertical sum
- // When the width is not a multiple of 4, we know that 'stride' is rounded up
- // to a multiple of 4. So it is safe for this loop to calculate extra columns
- // at the right-hand edge of the frame.
- int width_extend = (width + 3) & ~3;
- for (j = 0; j < width_extend; j += 4) {
- __m128i a, b, x, y, x2, y2;
- __m128i sum, sum_sq, tmp;
-
- a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
- b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
-
- sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
- tmp = _mm_unpacklo_epi16(a, b);
- sum_sq = _mm_madd_epi16(tmp, tmp);
-
- _mm_store_si128((__m128i *)&B[j], sum);
- _mm_store_si128((__m128i *)&A[j], sum_sq);
-
- x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
-
- for (i = 1; i < height - 2; ++i) {
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
- y = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i + 2) * src_stride + j]));
-
- sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
- x2 = _mm_mullo_epi32(x, x);
- y2 = _mm_mullo_epi32(y, y);
-
- sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
- }
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
+// Load 4 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_16_32(const void *p) {
+ return _mm_cvtepu16_epi32(xx_loadl_64(p));
+}
- _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
- }
+// Compute the scan of an SSE register holding 4 32-bit integers. If the
+// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2,
+// x0+x1+x2+x3
+static __m128i scan_32(__m128i x) {
+ const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4));
+ return _mm_add_epi32(x01, _mm_slli_si128(x01, 8));
}
-static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width,
- int height, int buf_stride, int eps,
- int bit_depth) {
- int i, j;
-
- // Horizontal sum
- int width_extend = (width + 3) & ~3;
- for (i = 0; i < height; ++i) {
- int h = AOMMIN(2, height - i) + AOMMIN(1, i);
-
- __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
- __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
- __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
- __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
-
- // Note: The _mm_slli_si128 call sets up a register containing
- // {0, A[i * buf_stride], ..., A[i * buf_stride + 2]},
- // so that the first element of 'sum' (which should only add two values
- // together) ends up calculated correctly.
- __m128i sum_ = _mm_add_epi32(_mm_slli_si128(b1, 4),
- _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)));
- __m128i sum_sq_ = _mm_add_epi32(
- _mm_slli_si128(a1, 4), _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)));
- __m128i n = _mm_set_epi32(3 * h, 3 * h, 3 * h, 2 * h);
- __m128i one_over_n =
- _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[3 * h - 1],
- one_by_x[3 * h - 1], one_by_x[2 * h - 1]);
- __m128i s = _mm_set_epi32(
- sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
- sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][2 * h - 1]);
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A,
- B);
-
- n = _mm_set1_epi32(3 * h);
- one_over_n = _mm_set1_epi32(one_by_x[3 * h - 1]);
- s = _mm_set1_epi32(sgrproj_mtable[eps - 1][3 * h - 1]);
-
- // Re-align a1 and b1 so that they start at index i * buf_stride + 3
- a2 = _mm_alignr_epi8(a2, a1, 12);
- b2 = _mm_alignr_epi8(b2, b1, 12);
-
- // Note: When the width is not a multiple of 4, this loop may end up
- // writing to the last 4 columns of the frame, potentially with incorrect
- // values (especially for r=2 and r=3).
- // This is fine, since we fix up those values in the block after this
- // loop, and in exchange we never have more than four values to
- // write / fix up after this loop finishes.
- for (j = 4; j < width_extend - 4; j += 4) {
- a1 = a2;
- b1 = b2;
- a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
- b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
- /* Loop invariant: At this point,
- a1 = original A[i * buf_stride + j - 1 : i * buf_stride + j + 3]
- a2 = original A[i * buf_stride + j + 3 : i * buf_stride + j + 7]
- and similar for b1,b2 and B
- */
- sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
- _mm_alignr_epi8(b2, b1, 8)));
- sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
- _mm_alignr_epi8(a2, a1, 8)));
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth,
- i * buf_stride + j, A, B);
- }
- __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
- __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
-
- j = width - 4;
- switch (width % 4) {
- case 0:
- a1 = a2;
- b1 = b2;
- a2 = a3;
- b2 = b3;
- break;
- case 1:
- a1 = _mm_alignr_epi8(a2, a1, 4);
- b1 = _mm_alignr_epi8(b2, b1, 4);
- a2 = _mm_alignr_epi8(a3, a2, 4);
- b2 = _mm_alignr_epi8(b3, b2, 4);
- break;
- case 2:
- a1 = _mm_alignr_epi8(a2, a1, 8);
- b1 = _mm_alignr_epi8(b2, b1, 8);
- a2 = _mm_alignr_epi8(a3, a2, 8);
- b2 = _mm_alignr_epi8(b3, b2, 8);
- break;
- case 3:
- a1 = _mm_alignr_epi8(a2, a1, 12);
- b1 = _mm_alignr_epi8(b2, b1, 12);
- a2 = _mm_alignr_epi8(a3, a2, 12);
- b2 = _mm_alignr_epi8(b3, b2, 12);
- break;
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
+// of 4.
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
+
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+
+ const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
+
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
+
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
}
-
- // Zero out the data loaded from "off the edge" of the array
- __m128i zero = _mm_setzero_si128();
- a2 = _mm_blend_epi16(a2, zero, 0xfc);
- b2 = _mm_blend_epi16(b2, zero, 0xfc);
-
- sum_ = _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
- _mm_alignr_epi8(b2, b1, 8)));
- sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
- _mm_alignr_epi8(a2, a1, 8)));
- n = _mm_set_epi32(2 * h, 3 * h, 3 * h, 3 * h);
- one_over_n = _mm_set_epi32(one_by_x[2 * h - 1], one_by_x[3 * h - 1],
- one_by_x[3 * h - 1], one_by_x[3 * h - 1]);
- s = _mm_set_epi32(
- sgrproj_mtable[eps - 1][2 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1],
- sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j,
- A, B);
}
}
-static void selfguided_restoration_2_v(uint8_t *src, int width, int height,
- int src_stride, int32_t *A, int32_t *B,
- int buf_stride) {
- int i, j;
-
- // Vertical sum
- int width_extend = (width + 3) & ~3;
- for (j = 0; j < width_extend; j += 4) {
- __m128i a, b, c, c2, x, y, x2, y2;
- __m128i sum, sum_sq, tmp;
-
- a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
- b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
- c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
- sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
- // Important: Since c may be up to 2^8, the result on squaring may
- // be up to 2^16. So we need to zero-extend, not sign-extend.
- c2 = _mm_cvtepu16_epi32(_mm_mullo_epi16(c, c));
- tmp = _mm_unpacklo_epi16(a, b);
- sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
- _mm_store_si128((__m128i *)&B[j], sum);
- _mm_store_si128((__m128i *)&A[j], sum_sq);
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
- x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[3 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
- _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+ const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
- x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
- for (i = 2; i < height - 3; ++i) {
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
- x = _mm_cvtepu8_epi32(
- _mm_cvtsi32_si128(*((int *)&src[(i - 2) * src_stride + j])));
- y = _mm_cvtepu8_epi32(
- _mm_cvtsi32_si128(*((int *)&src[(i + 3) * src_stride + j])));
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
- sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
- x2 = _mm_mullo_epi32(x, x);
- y2 = _mm_mullo_epi32(y, y);
-
- sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
}
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
}
}
-static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width,
- int height, int buf_stride, int eps,
- int bit_depth) {
- int i, j;
-
- // Horizontal sum
- int width_extend = (width + 3) & ~3;
- for (i = 0; i < height; ++i) {
- int h = AOMMIN(3, height - i) + AOMMIN(2, i);
-
- __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
- __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
- __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
- __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
-
- __m128i sum_ = _mm_add_epi32(
- _mm_add_epi32(
- _mm_add_epi32(_mm_slli_si128(b1, 8), _mm_slli_si128(b1, 4)),
- _mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4))),
- _mm_alignr_epi8(b2, b1, 8));
- __m128i sum_sq_ = _mm_add_epi32(
- _mm_add_epi32(
- _mm_add_epi32(_mm_slli_si128(a1, 8), _mm_slli_si128(a1, 4)),
- _mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4))),
- _mm_alignr_epi8(a2, a1, 8));
-
- __m128i n = _mm_set_epi32(5 * h, 5 * h, 4 * h, 3 * h);
- __m128i one_over_n =
- _mm_set_epi32(one_by_x[5 * h - 1], one_by_x[5 * h - 1],
- one_by_x[4 * h - 1], one_by_x[3 * h - 1]);
- __m128i s = _mm_set_epi32(
- sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
- sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]);
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A,
- B);
-
- // Re-align a1 and b1 so that they start at index i * buf_stride + 2
- a2 = _mm_alignr_epi8(a2, a1, 8);
- b2 = _mm_alignr_epi8(b2, b1, 8);
-
- n = _mm_set1_epi32(5 * h);
- one_over_n = _mm_set1_epi32(one_by_x[5 * h - 1]);
- s = _mm_set1_epi32(sgrproj_mtable[eps - 1][5 * h - 1]);
-
- for (j = 4; j < width_extend - 4; j += 4) {
- a1 = a2;
- a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
- b1 = b2;
- b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
- /* Loop invariant: At this point,
- a1 = original A[i * buf_stride + j - 2 : i * buf_stride + j + 2]
- a2 = original A[i * buf_stride + j + 2 : i * buf_stride + j + 6]
- and similar for b1,b2 and B
- */
- sum_ = _mm_add_epi32(
- _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
- _mm_alignr_epi8(b2, b1, 8))),
- _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
- sum_sq_ = _mm_add_epi32(
- _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
- _mm_alignr_epi8(a2, a1, 8))),
- _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
-
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth,
- i * buf_stride + j, A, B);
- }
- // If the width is not a multiple of 4, we need to reset j to width - 4
- // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained
- __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
- __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
-
- j = width - 4;
- switch (width % 4) {
- case 0:
- a1 = a2;
- b1 = b2;
- a2 = a3;
- b2 = b3;
- break;
- case 1:
- a1 = _mm_alignr_epi8(a2, a1, 4);
- b1 = _mm_alignr_epi8(b2, b1, 4);
- a2 = _mm_alignr_epi8(a3, a2, 4);
- b2 = _mm_alignr_epi8(b3, b2, 4);
- break;
- case 2:
- a1 = _mm_alignr_epi8(a2, a1, 8);
- b1 = _mm_alignr_epi8(b2, b1, 8);
- a2 = _mm_alignr_epi8(a3, a2, 8);
- b2 = _mm_alignr_epi8(b3, b2, 8);
- break;
- case 3:
- a1 = _mm_alignr_epi8(a2, a1, 12);
- b1 = _mm_alignr_epi8(b2, b1, 12);
- a2 = _mm_alignr_epi8(a3, a2, 12);
- b2 = _mm_alignr_epi8(b3, b2, 12);
- break;
- }
-
- // Zero out the data loaded from "off the edge" of the array
- __m128i zero = _mm_setzero_si128();
- a2 = _mm_blend_epi16(a2, zero, 0xf0);
- b2 = _mm_blend_epi16(b2, zero, 0xf0);
-
- sum_ = _mm_add_epi32(
- _mm_add_epi32(b1, _mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
- _mm_alignr_epi8(b2, b1, 8))),
- _mm_add_epi32(_mm_alignr_epi8(b2, b1, 12), b2));
- sum_sq_ = _mm_add_epi32(
- _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
- _mm_alignr_epi8(a2, a1, 8))),
- _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2));
-
- n = _mm_set_epi32(3 * h, 4 * h, 5 * h, 5 * h);
- one_over_n = _mm_set_epi32(one_by_x[3 * h - 1], one_by_x[4 * h - 1],
- one_by_x[5 * h - 1], one_by_x[5 * h - 1]);
- s = _mm_set_epi32(
- sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1],
- sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1]);
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j,
- A, B);
- }
+// Compute 4 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride);
+ const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride);
+ const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride);
+ const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride);
+ const __m128i u = _mm_sub_epi32(tr, tl);
+ const __m128i v = _mm_sub_epi32(br, bl);
+ return _mm_sub_epi32(v, u);
}
-static void selfguided_restoration_3_v(uint8_t *src, int width, int height,
- int src_stride, int32_t *A, int32_t *B,
- int buf_stride) {
- int i, j;
-
- // Vertical sum over 7-pixel regions, 4 columns at a time
- int width_extend = (width + 3) & ~3;
- for (j = 0; j < width_extend; j += 4) {
- __m128i a, b, c, d, x, y, x2, y2;
- __m128i sum, sum_sq, tmp, tmp2;
+static __m128i round_for_shift(unsigned shift) {
+ return _mm_set1_epi32((1 << shift) >> 1);
+}
- a = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[j]));
- b = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[src_stride + j]));
- c = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[2 * src_stride + j]));
- d = _mm_cvtepu8_epi16(xx_loadl_32((__m128i *)&src[3 * src_stride + j]));
+static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) {
+ __m128i an, bb;
+ if (bit_depth > 8) {
+ const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8));
+ const __m128i rounding_b = round_for_shift(bit_depth - 8);
+ const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+ const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a);
+ const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b);
+ // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+ // mullo to square it
+ bb = _mm_madd_epi16(b, b);
+ an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb);
+ } else {
+ bb = _mm_madd_epi16(sum1, sum1);
+ an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n));
+ }
+ return _mm_sub_epi32(an, bb);
+}
- sum = _mm_cvtepi16_epi32(
- _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
- tmp = _mm_unpacklo_epi16(a, b);
- tmp2 = _mm_unpacklo_epi16(c, d);
- sum_sq =
- _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
- _mm_store_si128((__m128i *)&B[j], sum);
- _mm_store_si128((__m128i *)&A[j], sum_sq);
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
- x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[4 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
- _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
- x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[5 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
- _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
- x = _mm_cvtepu8_epi32(xx_loadl_32((__m128i *)&src[6 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
- for (i = 3; i < height - 4; ++i) {
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
- x = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i - 3) * src_stride + j]));
- y = _mm_cvtepu8_epi32(xx_loadl_32(&src[(i + 4) * src_stride + j]));
+ xx_storeu_128(A + i * buf_stride + j, a_res);
- sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
- x2 = _mm_mullo_epi32(x, x);
- y2 = _mm_mullo_epi32(y, y);
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
- sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ xx_storeu_128(B + i * buf_stride + j, b_res);
}
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i - 3) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i - 2) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu8_epi32(
- xx_loadl_32((__m128i *)&src[(i - 1) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
}
}
-static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
- int height, int buf_stride, int eps,
- int bit_depth) {
- int i, j;
- // Horizontal sum over 7-pixel regions of dst
- int width_extend = (width + 3) & ~3;
- for (i = 0; i < height; ++i) {
- int h = AOMMIN(4, height - i) + AOMMIN(3, i);
-
- __m128i a1 = _mm_loadu_si128((__m128i *)&A[i * buf_stride]);
- __m128i b1 = _mm_loadu_si128((__m128i *)&B[i * buf_stride]);
- __m128i a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 4]);
- __m128i b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 4]);
-
- __m128i sum_ = _mm_add_epi32(
- _mm_add_epi32(
- _mm_add_epi32(_mm_slli_si128(b1, 12), _mm_slli_si128(b1, 8)),
- _mm_add_epi32(_mm_slli_si128(b1, 4), b1)),
- _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b2, b1, 4),
- _mm_alignr_epi8(b2, b1, 8)),
- _mm_alignr_epi8(b2, b1, 12)));
- __m128i sum_sq_ = _mm_add_epi32(
- _mm_add_epi32(
- _mm_add_epi32(_mm_slli_si128(a1, 12), _mm_slli_si128(a1, 8)),
- _mm_add_epi32(_mm_slli_si128(a1, 4), a1)),
- _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(a2, a1, 4),
- _mm_alignr_epi8(a2, a1, 8)),
- _mm_alignr_epi8(a2, a1, 12)));
-
- __m128i n = _mm_set_epi32(7 * h, 6 * h, 5 * h, 4 * h);
- __m128i one_over_n =
- _mm_set_epi32(one_by_x[7 * h - 1], one_by_x[6 * h - 1],
- one_by_x[5 * h - 1], one_by_x[4 * h - 1]);
- __m128i s = _mm_set_epi32(
- sgrproj_mtable[eps - 1][7 * h - 1], sgrproj_mtable[eps - 1][6 * h - 1],
- sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1]);
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A,
- B);
-
- // Re-align a1 and b1 so that they start at index i * buf_stride + 1
- a2 = _mm_alignr_epi8(a2, a1, 4);
- b2 = _mm_alignr_epi8(b2, b1, 4);
-
- n = _mm_set1_epi32(7 * h);
- one_over_n = _mm_set1_epi32(one_by_x[7 * h - 1]);
- s = _mm_set1_epi32(sgrproj_mtable[eps - 1][7 * h - 1]);
-
- for (j = 4; j < width_extend - 4; j += 4) {
- a1 = a2;
- a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
- b1 = b2;
- b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
- __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]);
- __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]);
- /* Loop invariant: At this point,
- a1 = original A[i * buf_stride + j - 3 : i * buf_stride + j + 1]
- a2 = original A[i * buf_stride + j + 1 : i * buf_stride + j + 5]
- a3 = original A[i * buf_stride + j + 5 : i * buf_stride + j + 9]
- and similar for b1,b2,b3 and B
- */
- sum_ = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
- _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
- _mm_alignr_epi8(b2, b1, 12))),
- _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(b3, b2, 4)),
- _mm_alignr_epi8(b3, b2, 8)));
- sum_sq_ = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
- _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
- _mm_alignr_epi8(a2, a1, 12))),
- _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(a3, a2, 4)),
- _mm_alignr_epi8(a3, a2, 8)));
-
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth,
- i * buf_stride + j, A, B);
- }
- __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
- __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
-
- j = width - 4;
- switch (width % 4) {
- case 0:
- a1 = a2;
- b1 = b2;
- a2 = a3;
- b2 = b3;
- break;
- case 1:
- a1 = _mm_alignr_epi8(a2, a1, 4);
- b1 = _mm_alignr_epi8(b2, b1, 4);
- a2 = _mm_alignr_epi8(a3, a2, 4);
- b2 = _mm_alignr_epi8(b3, b2, 4);
- break;
- case 2:
- a1 = _mm_alignr_epi8(a2, a1, 8);
- b1 = _mm_alignr_epi8(b2, b1, 8);
- a2 = _mm_alignr_epi8(a3, a2, 8);
- b2 = _mm_alignr_epi8(b3, b2, 8);
- break;
- case 3:
- a1 = _mm_alignr_epi8(a2, a1, 12);
- b1 = _mm_alignr_epi8(b2, b1, 12);
- a2 = _mm_alignr_epi8(a3, a2, 12);
- b2 = _mm_alignr_epi8(b3, b2, 12);
- break;
- }
-
- // Zero out the data loaded from "off the edge" of the array
- __m128i zero = _mm_setzero_si128();
- a2 = _mm_blend_epi16(a2, zero, 0xc0);
- b2 = _mm_blend_epi16(b2, zero, 0xc0);
-
- sum_ = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(b1, _mm_alignr_epi8(b2, b1, 4)),
- _mm_add_epi32(_mm_alignr_epi8(b2, b1, 8),
- _mm_alignr_epi8(b2, b1, 12))),
- _mm_add_epi32(_mm_add_epi32(b2, _mm_alignr_epi8(zero, b2, 4)),
- _mm_alignr_epi8(zero, b2, 8)));
- sum_sq_ = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(a1, _mm_alignr_epi8(a2, a1, 4)),
- _mm_add_epi32(_mm_alignr_epi8(a2, a1, 8),
- _mm_alignr_epi8(a2, a1, 12))),
- _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(zero, a2, 4)),
- _mm_alignr_epi8(zero, a2, 8)));
-
- n = _mm_set_epi32(4 * h, 5 * h, 6 * h, 7 * h);
- one_over_n = _mm_set_epi32(one_by_x[4 * h - 1], one_by_x[5 * h - 1],
- one_by_x[6 * h - 1], one_by_x[7 * h - 1]);
- s = _mm_set_epi32(
- sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1],
- sgrproj_mtable[eps - 1][6 * h - 1], sgrproj_mtable[eps - 1][7 * h - 1]);
- calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j,
- A, B);
- }
+// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = (fours + threes) << 2 - threes
+static INLINE __m128i cross_sum(const int32_t *buf, int stride) {
+ const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+ const __m128i xt = xx_loadu_128(buf - stride);
+ const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+ const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+ const __m128i xb = xx_loadu_128(buf + stride);
+ const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+ const __m128i fours = _mm_add_epi32(
+ xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x))));
+ const __m128i threes =
+ _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+
+ return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
}
-void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
- int dgd_stride, int32_t *dst,
- int dst_stride, int r, int eps) {
- const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
- const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
- int32_t A_[RESTORATION_PROC_UNIT_PELS];
- int32_t B_[RESTORATION_PROC_UNIT_PELS];
- int32_t *A = A_;
- int32_t *B = B_;
- int i, j;
- // Adjusting the stride of A and B here appears to avoid bad cache effects,
- // leading to a significant speed improvement.
- // We also align the stride to a multiple of 16 bytes for efficiency.
- int buf_stride = ((width_ext + 3) & ~3) + 16;
-
- // Don't filter tiles with dimensions < 5 on any axis
- if ((width < 5) || (height < 5)) return;
-
- uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
- if (r == 1) {
- selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
- buf_stride);
- selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
- } else if (r == 2) {
- selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
- buf_stride);
- selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
- } else if (r == 3) {
- selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
- buf_stride);
- selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
- } else {
- assert(0);
- }
- A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
- B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-
- {
- i = 0;
- j = 0;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
- A[k + buf_stride + 1];
- const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
- B[k + buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
- A[k + buf_stride - 1] + A[k + buf_stride + 1];
- const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
- B[k + buf_stride - 1] + B[k + buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- j = width - 1;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
- A[k + buf_stride - 1];
- const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
- B[k + buf_stride - 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- }
- for (i = 1; i < height - 1; ++i) {
- j = 0;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
- A[k + 1] + A[k - buf_stride + 1] +
- A[k + buf_stride + 1];
- const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
- B[k + 1] + B[k - buf_stride + 1] +
- B[k + buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
-
- // Vectorize the innermost loop
- for (j = 1; j < width - 1; j += 4) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 5;
-
- __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
- __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
- __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
- __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
- __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
- __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
-
- __m128i a0 = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
- _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
- _mm_alignr_epi8(tmp5, tmp4, 4))),
- _mm_alignr_epi8(tmp1, tmp0, 4));
- __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
- _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
- _mm_alignr_epi8(tmp5, tmp4, 8)));
- __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
-
- __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
- __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
- __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
- __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
- __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
- __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
-
- __m128i b0 = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
- _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
- _mm_alignr_epi8(tmp11, tmp10, 4))),
- _mm_alignr_epi8(tmp7, tmp6, 4));
- __m128i b1 =
- _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
- _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
- _mm_alignr_epi8(tmp11, tmp10, 8)));
- __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
-
- __m128i src = _mm_cvtepu8_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
-
- __m128i rounding = _mm_set1_epi32(
- (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
- __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;
+ const __m128i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
__m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- _mm_storeu_si128((__m128i *)&dst[m], w);
- }
- // Deal with any extra pixels at the right-hand edge of the frame
- // (typically have 2 such pixels, but may have anywhere between 0 and 3)
- for (; j < width - 1; ++j) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 5;
- const int32_t a =
- (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
- 4 +
- (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
- A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
- 3;
- const int32_t b =
- (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
- 4 +
- (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
- B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
- 3;
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
-
- j = width - 1;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
- A[k - 1] + A[k - buf_stride - 1] +
- A[k + buf_stride - 1];
- const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
- B[k - 1] + B[k - buf_stride - 1] +
- B[k + buf_stride - 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- }
-
- {
- i = height - 1;
- j = 0;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
- A[k - buf_stride + 1];
- const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
- B[k - buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
- A[k - buf_stride - 1] + A[k - buf_stride + 1];
- const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
- B[k - buf_stride - 1] + B[k - buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- j = width - 1;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
- A[k - buf_stride - 1];
- const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
- B[k - buf_stride - 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- }
-}
-
-void av1_highpass_filter_sse4_1(uint8_t *dgd, int width, int height, int stride,
- int32_t *dst, int dst_stride, int corner,
- int edge) {
- int i, j;
- const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
-
- {
- i = 0;
- j = 0;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
- corner *
- (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] = center * dgd[k] +
- edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
- corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
- dgd[k - 1] + dgd[k + 1]);
- }
- j = width - 1;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
- corner *
- (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
- }
- }
- {
- i = height - 1;
- j = 0;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
- corner *
- (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] = center * dgd[k] +
- edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
- corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
- dgd[k - 1] + dgd[k + 1]);
- }
- j = width - 1;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
- corner *
- (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
- }
- }
- __m128i center_ = _mm_set1_epi16(center);
- __m128i edge_ = _mm_set1_epi16(edge);
- __m128i corner_ = _mm_set1_epi16(corner);
- for (i = 1; i < height - 1; ++i) {
- j = 0;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] +
- edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
- corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
- dgd[k - stride] + dgd[k + stride]);
- }
- // Process in units of 8 pixels at a time.
- for (j = 1; j < width - 8; j += 8) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
-
- __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
- __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
- __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
-
- __m128i tl = _mm_cvtepu8_epi16(a);
- __m128i tr = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
- __m128i cl = _mm_cvtepu8_epi16(b);
- __m128i cr = _mm_cvtepu8_epi16(_mm_srli_si128(b, 8));
- __m128i bl = _mm_cvtepu8_epi16(c);
- __m128i br = _mm_cvtepu8_epi16(_mm_srli_si128(c, 8));
-
- __m128i x = _mm_alignr_epi8(cr, cl, 2);
- __m128i y = _mm_add_epi16(_mm_add_epi16(_mm_alignr_epi8(tr, tl, 2), cl),
- _mm_add_epi16(_mm_alignr_epi8(br, bl, 2),
- _mm_alignr_epi8(cr, cl, 4)));
- __m128i z = _mm_add_epi16(_mm_add_epi16(tl, bl),
- _mm_add_epi16(_mm_alignr_epi8(tr, tl, 4),
- _mm_alignr_epi8(br, bl, 4)));
-
- __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
- _mm_add_epi16(_mm_mullo_epi16(y, edge_),
- _mm_mullo_epi16(z, corner_)));
-
- _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
- _mm_storeu_si128((__m128i *)&dst[l + 4],
- _mm_cvtepi16_epi32(_mm_srli_si128(res, 8)));
- }
- // If there are enough pixels left in this row, do another batch of 4
- // pixels.
- for (; j < width - 4; j += 4) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
-
- __m128i a = _mm_loadl_epi64((__m128i *)&dgd[k - stride - 1]);
- __m128i b = _mm_loadl_epi64((__m128i *)&dgd[k - 1]);
- __m128i c = _mm_loadl_epi64((__m128i *)&dgd[k + stride - 1]);
-
- __m128i tl = _mm_cvtepu8_epi16(a);
- __m128i cl = _mm_cvtepu8_epi16(b);
- __m128i bl = _mm_cvtepu8_epi16(c);
-
- __m128i x = _mm_srli_si128(cl, 2);
- __m128i y = _mm_add_epi16(
- _mm_add_epi16(_mm_srli_si128(tl, 2), cl),
- _mm_add_epi16(_mm_srli_si128(bl, 2), _mm_srli_si128(cl, 4)));
- __m128i z = _mm_add_epi16(
- _mm_add_epi16(tl, bl),
- _mm_add_epi16(_mm_srli_si128(tl, 4), _mm_srli_si128(bl, 4)));
-
- __m128i res = _mm_add_epi16(_mm_mullo_epi16(x, center_),
- _mm_add_epi16(_mm_mullo_epi16(y, edge_),
- _mm_mullo_epi16(z, corner_)));
-
- _mm_storeu_si128((__m128i *)&dst[l], _mm_cvtepi16_epi32(res));
- }
- // Handle any leftover pixels
- for (; j < width - 1; ++j) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] +
- edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
- corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
- dgd[k - stride + 1] + dgd[k + stride + 1]);
- }
- j = width - 1;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] +
- edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
- corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
- dgd[k - stride] + dgd[k + stride]);
+ xx_storeu_128(dst + i * dst_stride + j, w);
}
}
}
-void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
- int stride, int eps, int *xqd,
- uint8_t *dst, int dst_stride,
- int32_t *tmpbuf) {
- int xq[2];
- int32_t *flt1 = tmpbuf;
- int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int i, j;
- assert(width * height <= RESTORATION_TILEPELS_MAX);
-#if USE_HIGHPASS_IN_SGRPROJ
- av1_highpass_filter_sse4_1(dat, width, height, stride, flt1, width,
- sgr_params[eps].corner, sgr_params[eps].edge);
-#else
- av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
- sgr_params[eps].r1, sgr_params[eps].e1);
-#endif // USE_HIGHPASS_IN_SGRPROJ
- av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
- sgr_params[eps].r2, sgr_params[eps].e2);
- decode_xq(xqd, xq);
-
- __m128i xq0 = _mm_set1_epi32(xq[0]);
- __m128i xq1 = _mm_set1_epi32(xq[1]);
- for (i = 0; i < height; ++i) {
- // Calculate output in batches of 8 pixels
- for (j = 0; j < width; j += 8) {
- const int k = i * width + j;
- const int l = i * stride + j;
- const int m = i * dst_stride + j;
- __m128i src =
- _mm_slli_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)&dat[l])),
- SGRPROJ_RST_BITS);
-
- const __m128i u_0 = _mm_cvtepu16_epi32(src);
- const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
-
- const __m128i f1_0 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
- const __m128i f2_0 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
- const __m128i f1_1 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
- const __m128i f2_1 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
-
- const __m128i v_0 = _mm_add_epi32(
- _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
- _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
- const __m128i v_1 = _mm_add_epi32(
- _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
- _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
-
- const __m128i rounding =
- _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
- const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
- SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
- const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
- SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
-
- const __m128i tmp = _mm_packs_epi32(w_0, w_1);
- const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
- _mm_storel_epi64((__m128i *)&dst[m], res);
- }
- // Process leftover pixels
- for (; j < width; ++j) {
- const int k = i * width + j;
- const int l = i * stride + j;
- const int m = i * dst_stride + j;
- const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
- const int32_t f1 = (int32_t)flt1[k] - u;
- const int32_t f2 = (int32_t)flt2[k] - u;
- const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
- const int16_t w =
- (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
- dst[m] = (uint16_t)clip_pixel(w);
- }
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
}
-}
-
-#if CONFIG_HIGHBITDEPTH
-// Only the vertical sums need to be adjusted for highbitdepth
-static void highbd_selfguided_restoration_1_v(uint16_t *src, int width,
- int height, int src_stride,
- int32_t *A, int32_t *B,
- int buf_stride) {
- int i, j;
+ for (int i = -1; i < height + 1; i += 2) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
- int width_extend = (width + 3) & ~3;
- for (j = 0; j < width_extend; j += 4) {
- __m128i a, b, x, y, x2, y2;
- __m128i sum, sum_sq, tmp;
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
- a = _mm_loadl_epi64((__m128i *)&src[j]);
- b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
- sum = _mm_cvtepi16_epi32(_mm_add_epi16(a, b));
- tmp = _mm_unpacklo_epi16(a, b);
- sum_sq = _mm_madd_epi16(tmp, tmp);
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
- _mm_store_si128((__m128i *)&B[j], sum);
- _mm_store_si128((__m128i *)&A[j], sum_sq);
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
- for (i = 1; i < height - 2; ++i) {
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
- y = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i + 2) * src_stride + j]));
+ xx_storeu_128(A + i * buf_stride + j, a_res);
- sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
- x2 = _mm_mullo_epi32(x, x);
- y2 = _mm_mullo_epi32(y, y);
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
- sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+ xx_storeu_128(B + i * buf_stride + j, b_res);
}
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
}
}
-static void highbd_selfguided_restoration_2_v(uint16_t *src, int width,
- int height, int src_stride,
- int32_t *A, int32_t *B,
- int buf_stride) {
- int i, j;
-
- int width_extend = (width + 3) & ~3;
- for (j = 0; j < width_extend; j += 4) {
- __m128i a, b, c, c2, x, y, x2, y2;
- __m128i sum, sum_sq, tmp;
-
- a = _mm_loadl_epi64((__m128i *)&src[j]);
- b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
- c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
-
- sum = _mm_cvtepi16_epi32(_mm_add_epi16(_mm_add_epi16(a, b), c));
- // Important: We need to widen *before* squaring here, since
- // c^2 may be up to 2^24.
- c = _mm_cvtepu16_epi32(c);
- c2 = _mm_mullo_epi32(c, c);
- tmp = _mm_unpacklo_epi16(a, b);
- sum_sq = _mm_add_epi32(_mm_madd_epi16(tmp, tmp), c2);
-
- _mm_store_si128((__m128i *)&B[j], sum);
- _mm_store_si128((__m128i *)&A[j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
-
- for (i = 2; i < height - 3; ++i) {
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
- y = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i + 3) * src_stride + j]));
-
- sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
- x2 = _mm_mullo_epi32(x, x);
- y2 = _mm_mullo_epi32(y, y);
-
- sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
- }
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
- }
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+// = 5 * (fives + sixes) - sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+ const __m128i xt = xx_loadu_128(buf - stride);
+ const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+ const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+ const __m128i xb = xx_loadu_128(buf + stride);
+ const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+ const __m128i fives =
+ _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+ const __m128i sixes = _mm_add_epi32(xt, xb);
+ const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+ return _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) {
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+
+ const __m128i fives = _mm_add_epi32(xl, xr);
+ const __m128i sixes = x;
+
+ const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+ return _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+ sixes);
}
-static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
- int height, int src_stride,
- int32_t *A, int32_t *B,
- int buf_stride) {
- int i, j;
-
- int width_extend = (width + 3) & ~3;
- for (j = 0; j < width_extend; j += 4) {
- __m128i a, b, c, d, x, y, x2, y2;
- __m128i sum, sum_sq, tmp, tmp2;
-
- a = _mm_loadl_epi64((__m128i *)&src[j]);
- b = _mm_loadl_epi64((__m128i *)&src[src_stride + j]);
- c = _mm_loadl_epi64((__m128i *)&src[2 * src_stride + j]);
- d = _mm_loadl_epi64((__m128i *)&src[3 * src_stride + j]);
-
- sum = _mm_cvtepi16_epi32(
- _mm_add_epi16(_mm_add_epi16(a, b), _mm_add_epi16(c, d)));
- tmp = _mm_unpacklo_epi16(a, b);
- tmp2 = _mm_unpacklo_epi16(c, d);
- sum_sq =
- _mm_add_epi32(_mm_madd_epi16(tmp, tmp), _mm_madd_epi16(tmp2, tmp2));
-
- _mm_store_si128((__m128i *)&B[j], sum);
- _mm_store_si128((__m128i *)&A[j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[4 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[5 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[2 * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[2 * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[6 * src_stride + j]));
- sum = _mm_add_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_add_epi32(sum_sq, x2);
-
- for (i = 3; i < height - 4; ++i) {
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
- y = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i + 4) * src_stride + j]));
-
- sum = _mm_add_epi32(sum, _mm_sub_epi32(y, x));
-
- x2 = _mm_mullo_epi32(x, x);
- y2 = _mm_mullo_epi32(y, y);
-
- sum_sq = _mm_add_epi32(sum_sq, _mm_sub_epi32(y2, x2));
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
+
+ const __m128i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m128i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m128i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
}
- _mm_store_si128((__m128i *)&B[i * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[i * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 3) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 1) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 1) * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 2) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 2) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 2) * buf_stride + j], sum_sq);
-
- x = _mm_cvtepu16_epi32(
- _mm_loadl_epi64((__m128i *)&src[(i - 1) * src_stride + j]));
- sum = _mm_sub_epi32(sum, x);
- x2 = _mm_mullo_epi32(x, x);
- sum_sq = _mm_sub_epi32(sum_sq, x2);
-
- _mm_store_si128((__m128i *)&B[(i + 3) * buf_stride + j], sum);
- _mm_store_si128((__m128i *)&A[(i + 3) * buf_stride + j], sum_sq);
}
}
-void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
- int height, int dgd_stride,
- int32_t *dst, int dst_stride,
- int bit_depth, int r, int eps) {
+void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+ int height, int dgd_stride,
+ int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
+ DECLARE_ALIGNED(16, int32_t, buf[4 * RESTORATION_PROC_UNIT_PELS]);
+ memset(buf, 0, sizeof(buf));
+
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
- int32_t A_[RESTORATION_PROC_UNIT_PELS];
- int32_t B_[RESTORATION_PROC_UNIT_PELS];
- int32_t *A = A_;
- int32_t *B = B_;
- int i, j;
+
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
int buf_stride = ((width_ext + 3) & ~3) + 16;
- // Don't filter tiles with dimensions < 5 on any axis
- if ((width < 5) || (height < 5)) return;
-
- uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
- if (r == 1) {
- highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride,
- A, B, buf_stride);
- selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps,
- bit_depth);
- } else if (r == 2) {
- highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride,
- A, B, buf_stride);
- selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps,
- bit_depth);
- } else if (r == 3) {
- highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride,
- A, B, buf_stride);
- selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps,
- bit_depth);
- } else {
- assert(0);
- }
- A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
- B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
-
- {
- i = 0;
- j = 0;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
- A[k + buf_stride + 1];
- const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] +
- B[k + buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
- A[k + buf_stride - 1] + A[k + buf_stride + 1];
- const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
- B[k + buf_stride - 1] + B[k + buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- j = width - 1;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
- A[k + buf_stride - 1];
- const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] +
- B[k + buf_stride - 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- }
- for (i = 1; i < height - 1; ++i) {
- j = 0;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
- A[k + 1] + A[k - buf_stride + 1] +
- A[k + buf_stride + 1];
- const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
- B[k + 1] + B[k - buf_stride + 1] +
- B[k + buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
-
- // Vectorize the innermost loop
- for (j = 1; j < width - 1; j += 4) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 5;
-
- __m128i tmp0 = _mm_loadu_si128((__m128i *)&A[k - 1 - buf_stride]);
- __m128i tmp1 = _mm_loadu_si128((__m128i *)&A[k + 3 - buf_stride]);
- __m128i tmp2 = _mm_loadu_si128((__m128i *)&A[k - 1]);
- __m128i tmp3 = _mm_loadu_si128((__m128i *)&A[k + 3]);
- __m128i tmp4 = _mm_loadu_si128((__m128i *)&A[k - 1 + buf_stride]);
- __m128i tmp5 = _mm_loadu_si128((__m128i *)&A[k + 3 + buf_stride]);
-
- __m128i a0 = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 4), tmp2),
- _mm_add_epi32(_mm_alignr_epi8(tmp3, tmp2, 8),
- _mm_alignr_epi8(tmp5, tmp4, 4))),
- _mm_alignr_epi8(tmp1, tmp0, 4));
- __m128i a1 = _mm_add_epi32(_mm_add_epi32(tmp0, tmp4),
- _mm_add_epi32(_mm_alignr_epi8(tmp1, tmp0, 8),
- _mm_alignr_epi8(tmp5, tmp4, 8)));
- __m128i a = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(a0, a1), 2), a1);
-
- __m128i tmp6 = _mm_loadu_si128((__m128i *)&B[k - 1 - buf_stride]);
- __m128i tmp7 = _mm_loadu_si128((__m128i *)&B[k + 3 - buf_stride]);
- __m128i tmp8 = _mm_loadu_si128((__m128i *)&B[k - 1]);
- __m128i tmp9 = _mm_loadu_si128((__m128i *)&B[k + 3]);
- __m128i tmp10 = _mm_loadu_si128((__m128i *)&B[k - 1 + buf_stride]);
- __m128i tmp11 = _mm_loadu_si128((__m128i *)&B[k + 3 + buf_stride]);
-
- __m128i b0 = _mm_add_epi32(
- _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 4), tmp8),
- _mm_add_epi32(_mm_alignr_epi8(tmp9, tmp8, 8),
- _mm_alignr_epi8(tmp11, tmp10, 4))),
- _mm_alignr_epi8(tmp7, tmp6, 4));
- __m128i b1 =
- _mm_add_epi32(_mm_add_epi32(tmp6, tmp10),
- _mm_add_epi32(_mm_alignr_epi8(tmp7, tmp6, 8),
- _mm_alignr_epi8(tmp11, tmp10, 8)));
- __m128i b = _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(b0, b1), 2), b1);
-
- __m128i src = _mm_cvtepu16_epi32(_mm_loadu_si128((__m128i *)&dgd[l]));
-
- __m128i rounding = _mm_set1_epi32(
- (1 << (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)) >> 1);
- __m128i v = _mm_add_epi32(_mm_mullo_epi32(a, src), b);
- __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
- SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- _mm_storeu_si128((__m128i *)&dst[m], w);
- }
-
- // Deal with any extra pixels at the right-hand edge of the frame
- // (typically have 2 such pixels, but may have anywhere between 0 and 3)
- for (; j < width - 1; ++j) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 5;
- const int32_t a =
- (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
- 4 +
- (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
- A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
- 3;
- const int32_t b =
- (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
- 4 +
- (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
- B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
- 3;
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
-
- j = width - 1;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
- A[k - 1] + A[k - buf_stride - 1] +
- A[k + buf_stride - 1];
- const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
- B[k - 1] + B[k - buf_stride - 1] +
- B[k + buf_stride - 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
+ // The "tl" pointers point at the top-left of the initialised data for the
+ // array. Adding 3 here ensures that column 1 is 16-byte aligned.
+ int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column in A, B (integral images), so we move down
+ // and right one for them.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
}
- {
- i = height - 1;
- j = 0;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
- A[k - buf_stride + 1];
- const int32_t b = 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] +
- B[k - buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
- A[k - buf_stride - 1] + A[k - buf_stride + 1];
- const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
- B[k - buf_stride - 1] + B[k - buf_stride + 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
- j = width - 1;
- {
- const int k = i * buf_stride + j;
- const int l = i * dgd_stride + j;
- const int m = i * dst_stride + j;
- const int nb = 3;
- const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
- A[k - buf_stride - 1];
- const int32_t b = 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] +
- B[k - buf_stride - 1];
- const int32_t v = a * dgd[l] + b;
- dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
- }
+ if (params->r[1] > 0) {
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
}
}
-void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height,
- int stride, int32_t *dst, int dst_stride,
- int corner, int edge) {
- int i, j;
- const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
-
- {
- i = 0;
- j = 0;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
- corner *
- (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] = center * dgd[k] +
- edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
- corner * (dgd[k + stride - 1] + dgd[k + stride + 1] +
- dgd[k - 1] + dgd[k + 1]);
- }
- j = width - 1;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
- corner *
- (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
- }
- }
- __m128i center_ = _mm_set1_epi32(center);
- __m128i edge_ = _mm_set1_epi32(edge);
- __m128i corner_ = _mm_set1_epi32(corner);
- for (i = 1; i < height - 1; ++i) {
- j = 0;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] +
- edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
- corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
- dgd[k - stride] + dgd[k + stride]);
- }
- // Process 4 pixels at a time
- for (j = 1; j < width - 4; j += 4) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
-
- __m128i a = _mm_loadu_si128((__m128i *)&dgd[k - stride - 1]);
- __m128i b = _mm_loadu_si128((__m128i *)&dgd[k - 1]);
- __m128i c = _mm_loadu_si128((__m128i *)&dgd[k + stride - 1]);
-
- __m128i tl = _mm_cvtepu16_epi32(a);
- __m128i tr = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
- __m128i cl = _mm_cvtepu16_epi32(b);
- __m128i cr = _mm_cvtepu16_epi32(_mm_srli_si128(b, 8));
- __m128i bl = _mm_cvtepu16_epi32(c);
- __m128i br = _mm_cvtepu16_epi32(_mm_srli_si128(c, 8));
-
- __m128i x = _mm_alignr_epi8(cr, cl, 4);
- __m128i y = _mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(tr, tl, 4), cl),
- _mm_add_epi32(_mm_alignr_epi8(br, bl, 4),
- _mm_alignr_epi8(cr, cl, 8)));
- __m128i z = _mm_add_epi32(_mm_add_epi32(tl, bl),
- _mm_add_epi32(_mm_alignr_epi8(tr, tl, 8),
- _mm_alignr_epi8(br, bl, 8)));
-
- __m128i res = _mm_add_epi32(_mm_mullo_epi32(x, center_),
- _mm_add_epi32(_mm_mullo_epi32(y, edge_),
- _mm_mullo_epi32(z, corner_)));
-
- _mm_storeu_si128((__m128i *)&dst[l], res);
- }
- // Handle any leftover pixels
- for (; j < width - 1; ++j) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] +
- edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
- corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
- dgd[k - stride + 1] + dgd[k + stride + 1]);
- }
- j = width - 1;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] +
- edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
- corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
- dgd[k - stride] + dgd[k + stride]);
- }
- }
- {
- i = height - 1;
- j = 0;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
- corner *
- (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
- }
- for (j = 1; j < width - 1; ++j) {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] = center * dgd[k] +
- edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
- corner * (dgd[k - stride - 1] + dgd[k - stride + 1] +
- dgd[k - 1] + dgd[k + 1]);
- }
- j = width - 1;
- {
- const int k = i * stride + j;
- const int l = i * dst_stride + j;
- dst[l] =
- center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
- corner *
- (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
- }
- }
-}
-
-void apply_selfguided_restoration_highbd_sse4_1(
- uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
- int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ av1_selfguided_restoration_sse4_1(dat8, width, height, stride, flt0, flt1,
+ width, eps, bit_depth, highbd);
+ const sgr_params_type *const params = &sgr_params[eps];
int xq[2];
- int32_t *flt1 = tmpbuf;
- int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
- int i, j;
- assert(width * height <= RESTORATION_TILEPELS_MAX);
-#if USE_HIGHPASS_IN_SGRPROJ
- av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width,
- sgr_params[eps].corner,
- sgr_params[eps].edge);
-#else
- av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
- width, bit_depth, sgr_params[eps].r1,
- sgr_params[eps].e1);
-#endif // USE_HIGHPASS_IN_SGRPROJ
- av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
- width, bit_depth, sgr_params[eps].r2,
- sgr_params[eps].e2);
- decode_xq(xqd, xq);
+ decode_xq(xqd, xq, params);
__m128i xq0 = _mm_set1_epi32(xq[0]);
__m128i xq1 = _mm_set1_epi32(xq[1]);
- for (i = 0; i < height; ++i) {
+
+ for (int i = 0; i < height; ++i) {
// Calculate output in batches of 8 pixels
- for (j = 0; j < width; j += 8) {
+ for (int j = 0; j < width; j += 8) {
const int k = i * width + j;
- const int l = i * stride + j;
const int m = i * dst_stride + j;
- __m128i src =
- _mm_slli_epi16(_mm_load_si128((__m128i *)&dat[l]), SGRPROJ_RST_BITS);
-
- const __m128i u_0 = _mm_cvtepu16_epi32(src);
- const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
-
- const __m128i f1_0 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
- const __m128i f2_0 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
- const __m128i f1_1 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
- const __m128i f2_1 =
- _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
-
- const __m128i v_0 = _mm_add_epi32(
- _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
- _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
- const __m128i v_1 = _mm_add_epi32(
- _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
- _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
+
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m128i src;
+ if (highbd) {
+ src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ } else {
+ src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
+ }
+
+ const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
+ const __m128i u_0 = _mm_cvtepu16_epi32(u);
+ const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
+
+ __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0);
+ v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0));
+
+ const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1);
+ v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1));
+ }
+
+ if (params->r[1] > 0) {
+ const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0);
+ v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0));
+
+ const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1);
+ v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1));
+ }
const __m128i rounding =
- _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
- // Pack into 16 bits and clamp to [0, 2^bit_depth)
- const __m128i tmp = _mm_packus_epi32(w_0, w_1);
- const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
- const __m128i res = _mm_min_epi16(tmp, max);
-
- _mm_store_si128((__m128i *)&dst[m], res);
- }
- // Process leftover pixels
- for (; j < width; ++j) {
- const int k = i * width + j;
- const int l = i * stride + j;
- const int m = i * dst_stride + j;
- const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
- const int32_t f1 = (int32_t)flt1[k] - u;
- const int32_t f2 = (int32_t)flt2[k] - u;
- const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
- const int16_t w =
- (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
- dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+ const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+ const __m128i res = _mm_min_epi16(tmp, max);
+ xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+ const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+ const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
+ xx_storel_64(dst8 + m, res);
+ }
}
}
}
-
-#endif
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c
deleted file mode 100644
index d30466ae6..000000000
--- a/third_party/aom/av1/common/x86/warp_plane_sse2.c
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/warped_motion.h"
-
-void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
- int height, int stride, uint8_t *pred, int p_col,
- int p_row, int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y,
- ConvolveParams *conv_params, int16_t alpha,
- int16_t beta, int16_t gamma, int16_t delta) {
- int comp_avg = conv_params->do_average;
- __m128i tmp[15];
- int i, j, k;
- const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
- const int reduce_bits_horiz =
- use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz =
- use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
- if (use_conv_params) {
- conv_params->do_post_rounding = 1;
- }
- assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
-
- /* Note: For this code to work, the left/right frame borders need to be
- extended by at least 13 pixels each. By the time we get here, other
- code will have set up this border, but we allow an explicit check
- for debugging purposes.
- */
- /*for (i = 0; i < height; ++i) {
- for (j = 0; j < 13; ++j) {
- assert(ref[i * stride - 13 + j] == ref[i * stride]);
- assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
- }
- }*/
-
- for (i = 0; i < p_height; i += 8) {
- for (j = 0; j < p_width; j += 8) {
- const int32_t src_x = (p_col + j + 4) << subsampling_x;
- const int32_t src_y = (p_row + i + 4) << subsampling_y;
- const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
- const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
- const int32_t x4 = dst_x >> subsampling_x;
- const int32_t y4 = dst_y >> subsampling_y;
-
- int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
- int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
- int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
- int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
- // Add in all the constant terms, including rounding and offset
- sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
- sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
- (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
-
- sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
- sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
-
- // Horizontal filter
- // If the block is aligned such that, after clamping, every sample
- // would be taken from the leftmost/rightmost column, then we can
- // skip the expensive horizontal filter.
- if (ix4 <= -7) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
- ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- }
- } else if (ix4 >= width + 6) {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
- }
- } else {
- for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
- int iy = iy4 + k;
- if (iy < 0)
- iy = 0;
- else if (iy > height - 1)
- iy = height - 1;
- int sx = sx4 + beta * (k + 4);
-
- // Load source pixels
- const __m128i zero = _mm_setzero_si128();
- const __m128i src =
- _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
- // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i round_const = _mm_set1_epi32(
- (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
- // Calculate filtered results
- const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
-
- // Filter odd-index pixels
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
- _mm_cvtsi32_si128(reduce_bits_horiz));
-
- // Combine results into one register.
- // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
- // as this order helps with the vertical filter.
- tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
- }
- }
-
- // Vertical filter
- for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
- int sy = sy4 + delta * (k + 4);
-
- // Load from tmp and rearrange pairs of consecutive rows into the
- // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
- const __m128i *src = tmp + (k + 4);
- const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
- const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
- const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
- const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_2 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_4 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_6 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
- const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
- const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
- const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
-
- const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
- const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
- const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
- const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
- const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
- const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
-
- const __m128i tmp_1 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_3 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_5 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
- const __m128i tmp_7 = _mm_loadu_si128(
- (__m128i *)(warped_filter +
- ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
-
- const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
- const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
-
- const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
- const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
- const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-#if CONFIG_CONVOLVE_ROUND
- if (use_conv_params) {
- __m128i *const p =
- (__m128i *)&conv_params
- ->dst[(i + k + 4) * conv_params->dst_stride + j];
- const __m128i round_const = _mm_set1_epi32(
- -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
- ((1 << (conv_params->round_1)) >> 1));
- res_lo = _mm_add_epi32(res_lo, round_const);
- res_lo =
- _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
- if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
- _mm_storeu_si128(p, res_lo);
- if (p_width > 4) {
- res_hi = _mm_add_epi32(res_hi, round_const);
- res_hi =
- _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
- if (comp_avg)
- res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
- _mm_storeu_si128(p + 1, res_hi);
- }
- } else {
-#else
- {
-#endif
- // Round and pack into 8 bits
- const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
- ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
-
- const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
- // Store, blending with 'pred' if needed
- __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
-
- // Note: If we're outputting a 4x4 block, we need to be very careful
- // to only output 4 pixels at this point, to avoid encode/decode
- // mismatches when encoding with multiple threads.
- if (p_width == 4) {
- if (comp_avg) {
- const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
- res_8bit = _mm_avg_epu8(res_8bit, orig);
- }
- *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
- } else {
- if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
- _mm_storel_epi64(p, res_8bit);
- }
- }
- }
- }
- }
-}
diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
index 3986ad389..efc542cbf 100644
--- a/third_party/aom/av1/common/x86/warp_plane_ssse3.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -9,9 +9,11 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
-#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
/* This is a modified version of 'warped_filter' from warped_motion.c:
@@ -201,41 +203,142 @@ static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
9, 11, 11, 13, 13, 15, 15, 0 };
-void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
- int height, int stride, uint8_t *pred, int p_col,
- int p_row, int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y,
- ConvolveParams *conv_params, int16_t alpha,
- int16_t beta, int16_t gamma, int16_t delta) {
- int comp_avg = conv_params->do_average;
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+ int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ const __m128i src_even =
+ _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+ const __m128i src_odd =
+ _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+ const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
+ const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
+ const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
+ const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+ const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+ const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ // The pixel order we need for 'src' is:
+ // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+ const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+ const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
+ // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+ const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+ _mm_srli_si128(src_odd, 4));
+ const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
+ // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+ const __m128i src_13 =
+ _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+ const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
+ // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+ const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+ _mm_srli_si128(src_even, 6));
+ const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
+
+ const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ // Note: The values res_02 + res_46 and res_13 + res_57 both
+ // fit into int16s at this point, but their sum may be too wide to fit
+ // into an int16. However, once we also add round_const, the sum of
+ // all of these fits into a uint16.
+ //
+ // The wrapping behaviour of _mm_add_* is used here to make sure we
+ // get the correct result despite converting between different
+ // (implicit) types.
+ const __m128i res_even = _mm_add_epi16(res_02, res_46);
+ const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+ const __m128i res =
+ _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+ tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
__m128i tmp[15];
int i, j, k;
const int bd = 8;
-#if CONFIG_CONVOLVE_ROUND
- const int use_conv_params = conv_params->round == CONVOLVE_OPT_NO_ROUND;
- const int reduce_bits_horiz =
- use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz =
- use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
- if (use_conv_params) {
- conv_params->do_post_rounding = 1;
- }
- assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);
-#else
- const int reduce_bits_horiz = HORSHEAR_REDUCE_PREC_BITS;
- const int offset_bits_horiz = bd + WARPEDPIXEL_FILTER_BITS - 1;
-#endif
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const __m128i res_sub_const =
+ _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
/* Note: For this code to work, the left/right frame borders need to be
- extended by at least 13 pixels each. By the time we get here, other
- code will have set up this border, but we allow an explicit check
- for debugging purposes.
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
*/
/*for (i = 0; i < height; ++i) {
- for (j = 0; j < 13; ++j) {
- assert(ref[i * stride - 13 + j] == ref[i * stride]);
- assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
- }
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
}*/
for (i = 0; i < p_height; i += 8) {
@@ -273,10 +376,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
else if (iy > height - 1)
iy = height - 1;
tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
- ref[iy * stride] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
}
} else if (ix4 >= width + 6) {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -285,11 +386,37 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
iy = 0;
else if (iy > height - 1)
iy = height - 1;
- tmp[k + 7] = _mm_set1_epi16(
- (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
- 1)) +
- ref[iy * stride + (width - 1)] *
- (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
}
} else {
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -303,89 +430,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
// Load source pixels
const __m128i src =
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
- const __m128i src_even =
- _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
- const __m128i src_odd =
- _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
-
- // Filter even-index pixels
- const __m128i tmp_0 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_1 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_2 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_3 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_4 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_5 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_6 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
- const __m128i tmp_7 = _mm_loadl_epi64((
- __m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
-
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
- const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
- const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
- const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
- // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
- const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
-
- // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
- const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
- // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
- const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
- // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
- const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
- // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
- const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
-
- // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_02 = _mm_unpacklo_epi64(tmp_12, tmp_14);
- // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_46 = _mm_unpackhi_epi64(tmp_12, tmp_14);
- // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_13 = _mm_unpacklo_epi64(tmp_13, tmp_15);
- // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
- const __m128i coeff_57 = _mm_unpackhi_epi64(tmp_13, tmp_15);
-
- // The pixel order we need for 'src' is:
- // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
- const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
- const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff_02);
- // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
- const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
- _mm_srli_si128(src_odd, 4));
- const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff_46);
- // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
- const __m128i src_13 =
- _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
- const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff_13);
- // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
- const __m128i src_57 = _mm_unpacklo_epi64(
- _mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6));
- const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
-
- const __m128i round_const = _mm_set1_epi16(
- (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
-
- // Note: The values res_02 + res_46 and res_13 + res_57 both
- // fit into int16s at this point, but their sum may be too wide to fit
- // into an int16. However, once we also add round_const, the sum of
- // all of these fits into a uint16.
- //
- // The wrapping behaviour of _mm_add_* is used here to make sure we
- // get the correct result despite converting between different
- // (implicit) types.
- const __m128i res_even = _mm_add_epi16(res_02, res_46);
- const __m128i res_odd = _mm_add_epi16(res_13, res_57);
- const __m128i res =
- _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
- tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
}
}
@@ -474,40 +520,85 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
__m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
__m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-#if CONFIG_CONVOLVE_ROUND
- if (use_conv_params) {
+ if (conv_params->is_compound) {
__m128i *const p =
(__m128i *)&conv_params
->dst[(i + k + 4) * conv_params->dst_stride + j];
- const __m128i round_const = _mm_set1_epi32(
- -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
- ((1 << (conv_params->round_1)) >> 1));
- res_lo = _mm_add_epi32(res_lo, round_const);
- res_lo =
- _mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
- if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
- _mm_storeu_si128(p, res_lo);
+ res_lo = _mm_add_epi32(res_lo, res_add_const);
+ res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ const __m128i temp_lo_16 = _mm_packus_epi32(res_lo, res_lo);
+ __m128i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ const __m128i p_16 = _mm_loadl_epi64(p);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+ }
+
+ res_lo_16 = _mm_add_epi16(res_lo_16, res_sub_const);
+
+ res_lo_16 = _mm_sra_epi16(
+ _mm_add_epi16(res_lo_16, round_bits_const), round_bits_shift);
+ __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+ *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
+ } else {
+ _mm_storel_epi64(p, temp_lo_16);
+ }
if (p_width > 4) {
- res_hi = _mm_add_epi32(res_hi, round_const);
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = _mm_add_epi32(res_hi, res_add_const);
res_hi =
- _mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
- if (comp_avg)
- res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
- _mm_storeu_si128(p + 1, res_hi);
+ _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ const __m128i temp_hi_16 = _mm_packus_epi32(res_hi, res_hi);
+ __m128i res_hi_16;
+
+ if (conv_params->do_average) {
+ __m128i *const dst8_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm_add_epi16(res_hi_16, res_sub_const);
+
+ res_hi_16 = _mm_sra_epi16(
+ _mm_add_epi16(res_hi_16, round_bits_const), round_bits_shift);
+ __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+ *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+ } else {
+ _mm_storel_epi64(p4, temp_hi_16);
+ }
}
} else {
-#else
- {
-#endif
// Round and pack into 8 bits
const __m128i round_const =
- _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
- ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
__m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
@@ -519,13 +610,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
// to only output 4 pixels at this point, to avoid encode/decode
// mismatches when encoding with multiple threads.
if (p_width == 4) {
- if (comp_avg) {
- const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
- res_8bit = _mm_avg_epu8(res_8bit, orig);
- }
*(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
} else {
- if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
_mm_storel_epi64(p, res_8bit);
}
}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
new file mode 100644
index 000000000..e1449fd21
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
+// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const ConvolveParams *conv_params) {
+ const int bd = 8;
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+ const __m256i clamp_low = zero_256;
+ const __m256i clamp_high =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+ /* Horizontal filter */
+ {
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint8_t *data_ij = src_ptr + i * src_stride + j;
+
+ // Load 8-bit src data
+ const __m128i data_0 = xx_loadu_128(data_ij + 0);
+ const __m128i data_1 = xx_loadu_128(data_ij + 1);
+ const __m128i data_2 = xx_loadu_128(data_ij + 2);
+ const __m128i data_3 = xx_loadu_128(data_ij + 3);
+ const __m128i data_4 = xx_loadu_128(data_ij + 4);
+ const __m128i data_5 = xx_loadu_128(data_ij + 5);
+ const __m128i data_6 = xx_loadu_128(data_ij + 6);
+ const __m128i data_7 = xx_loadu_128(data_ij + 7);
+
+ // (Zero-)Extend 8-bit data to 16-bit data
+ const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
+ const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
+ const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
+ const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
+ const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
+ const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
+ const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
+ const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ // Calculate scalar product for even- and odd-indices separately,
+ // increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+
+ // Filter the even-indices, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+ // Filter the odd-indices, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+
+ // Reduce to 8-bit precision. This messes up the order:
+ // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
+ // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit =
+ _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+
+ // Swap the two central 32-bit values to get the order:
+ // [ - - - - - - - - - - - - - - - - ]
+ // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+
+ // Store the lower 128-bit lane in the dst array
+ xx_storeu_128(dst + i * dst_stride + j,
+ _mm256_castsi256_si128(res_8bit2));
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
new file mode 100644
index 000000000..3083d224b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const ConvolveParams *conv_params) {
+ const int bd = 8;
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ int i, j;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero = _mm_setzero_si128();
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+ /* Horizontal filter */
+ {
+ const __m128i coeffs_x =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ conv_params->round_0);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ conv_params->round_0);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_min_epi16(
+ _mm_max_epi16(res, zero),
+ _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1));
+ _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m128i coeffs_y =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+ }
+}