Diffstat (limited to 'third_party/aom/av1/common/x86')
-rw-r--r--  third_party/aom/av1/common/x86/av1_convolve_ssse3.c | 16
-rw-r--r--  third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c | 7
-rw-r--r--  third_party/aom/av1/common/x86/av1_txfm1d_sse4.h | 2
-rw-r--r--  third_party/aom/av1/common/x86/convolve_2d_sse2.c | 367
-rw-r--r--  third_party/aom/av1/common/x86/convolve_avx2.c | 342
-rw-r--r--  third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 372
-rw-r--r--  third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c | 5
-rw-r--r--  third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c | 3
-rw-r--r--  third_party/aom/av1/common/x86/idct_intrin_sse2.c | 27
-rw-r--r--  third_party/aom/av1/common/x86/selfguided_sse4.c | 4
-rw-r--r--  third_party/aom/av1/common/x86/warp_plane_sse2.c | 7
-rw-r--r--  third_party/aom/av1/common/x86/warp_plane_ssse3.c | 7
12 files changed, 1130 insertions, 29 deletions
diff --git a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
index 5e627ebcf..e85c15eaf 100644
--- a/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
+++ b/third_party/aom/av1/common/x86/av1_convolve_ssse3.c
@@ -676,11 +676,12 @@ void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
__m128i verf[6];
__m128i horf[2];
SubpelFilterCoeffs hCoeffs, vCoeffs;
+ assert(conv_params->do_average == 0 || conv_params->do_average == 1);
const uint8_t *src_ptr;
- store_pixel_t store2p = store2pixelTab[conv_params->ref];
- store_pixel_t store4p = store4pixelTab[conv_params->ref];
- transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref];
- transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref];
+ store_pixel_t store2p = store2pixelTab[conv_params->do_average];
+ store_pixel_t store4p = store4pixelTab[conv_params->do_average];
+ transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->do_average];
+ transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->do_average];
const int tapsNum = filter_params.taps;
int block_height, block_residu;
@@ -890,10 +891,11 @@ void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
__m128i verf[6];
SubpelFilterCoeffs vCoeffs;
const uint8_t *src_ptr;
+ assert(conv_params->do_average == 0 || conv_params->do_average == 1);
uint8_t *dst_ptr = dst;
- store_pixel_t store2p = store2pixelTab[conv_params->ref];
- store_pixel_t store4p = store4pixelTab[conv_params->ref];
- store_pixel_t store8p = store8pixelTab[conv_params->ref];
+ store_pixel_t store2p = store2pixelTab[conv_params->do_average];
+ store_pixel_t store4p = store4pixelTab[conv_params->do_average];
+ store_pixel_t store8p = store8pixelTab[conv_params->do_average];
const int tapsNum = filter_params.taps;
if (0 == subpel_y_q4 || 16 != y_step_q4) {
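
The hunks above index the two-entry store/transpose tables by conv_params->do_average instead of conv_params->ref, and the new asserts pin that index to {0, 1}. A minimal sketch of the dispatch pattern such a table implements (the typedef, names, and 4-pixel width here are illustrative, not the file's actual definitions):

#include <emmintrin.h>
#include <stdint.h>

typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);  // illustrative

// do_average == 0: overwrite the destination pixels.
static void store_4_plain(const __m128i *x, uint8_t *dst) {
  *(uint32_t *)dst = (uint32_t)_mm_cvtsi128_si32(*x);
}

// do_average == 1: average with the prediction already in dst.
static void store_4_avg(const __m128i *x, uint8_t *dst) {
  __m128i d = _mm_cvtsi32_si128((int)*(const uint32_t *)dst);
  d = _mm_avg_epu8(*x, d);
  *(uint32_t *)dst = (uint32_t)_mm_cvtsi128_si32(d);
}

static const store_pixel_t store4_tab[2] = { store_4_plain, store_4_avg };
// usage: store4_tab[conv_params->do_average](&pixels, dst);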
diff --git a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
index 1d7c55349..f7824b627 100644
--- a/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
+++ b/third_party/aom/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -40,7 +40,12 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
const int stride,
const TXFM_2D_FLIP_CFG *cfg,
int32_t *txfm_buf) {
- // TODO(sarahparker) must correct for rectangular transforms in follow up
+ // TODO(sarahparker) This does not currently support rectangular transforms
+ // and will break without splitting txfm_size out into row and col size.
+ // Rectangular transforms use c code only, so it should be ok for now.
+ // It will be corrected when there are sse implementations for rectangular
+ // transforms.
+ assert(cfg->row_cfg->txfm_size == cfg->col_cfg->txfm_size);
const int txfm_size = cfg->row_cfg->txfm_size;
const int8_t *shift = cfg->row_cfg->shift;
const int8_t *stage_range_col = cfg->col_cfg->stage_range;
diff --git a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
index af7afb7ee..fd0a6ed2c 100644
--- a/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
+++ b/third_party/aom/av1/common/x86/av1_txfm1d_sse4.h
@@ -64,7 +64,7 @@ static INLINE void transpose_32_4x4(int stride, const __m128i *input,
// the entire input block can be represent by a grid of 4x4 blocks
// each 4x4 blocks can be represent by 4 vertical __m128i
// we first transpose each 4x4 block internally
-// than transpose the grid
+// then transpose the grid
static INLINE void transpose_32(int txfm_size, const __m128i *input,
__m128i *output) {
const int num_per_128 = 4;
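
The comment above describes the transpose strategy: view the txfm_size x txfm_size block of 32-bit coefficients as a grid of 4x4 tiles, transpose each tile internally, then transpose the grid. A scalar sketch of the same index mapping (plain int32_t instead of __m128i lanes; the helper name is made up):

#include <stdint.h>

// Each element (r, c) ends up at (c, r); doing it tile by tile mirrors how
// the SSE4.1 routine moves whole 4x4 blocks of __m128i lanes.
static void transpose_32_scalar(int txfm_size, const int32_t *input,
                                int32_t *output) {
  for (int r0 = 0; r0 < txfm_size; r0 += 4) {    // tile row
    for (int c0 = 0; c0 < txfm_size; c0 += 4) {  // tile column
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
          output[(c0 + c) * txfm_size + (r0 + r)] =
              input[(r0 + r) * txfm_size + (c0 + c)];
    }
  }
}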
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
new file mode 100644
index 000000000..46c2674ca
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+#if CONFIG_COMPOUND_ROUND
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ DECLARE_ALIGNED(16, uint8_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_packus_epi16(res, res);
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint8_t *data = &im_block[i * im_stride + j];
+ const __m128i src_01 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * im_stride)));
+ const __m128i src_23 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * im_stride)));
+ const __m128i src_45 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * im_stride)));
+ const __m128i src_67 = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 6 * im_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 7 * im_stride)));
+
+ const __m128i src_0 = _mm_unpacklo_epi8(src_01, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src_23, zero);
+ const __m128i src_4 = _mm_unpacklo_epi8(src_45, zero);
+ const __m128i src_6 = _mm_unpacklo_epi8(src_67, zero);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi8(src_01, zero);
+ const __m128i src_3 = _mm_unpackhi_epi8(src_23, zero);
+ const __m128i src_5 = _mm_unpackhi_epi8(src_45, zero);
+ const __m128i src_7 = _mm_unpackhi_epi8(src_67, zero);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
+ _mm_storeu_si128(p + 1,
+ _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ }
+ }
+ }
+}
+#else
+void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+ InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
+ _mm_storeu_si128(p + 1,
+ _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ }
+ }
+ }
+}
+#endif
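
The new file above implements the separable 2D convolve in two passes: a horizontal pass writes a rounded (round_0) intermediate block, and a vertical pass rounds again (round_1) and accumulates into the 32-bit CONV_BUF_TYPE destination. A scalar model of the !CONFIG_COMPOUND_ROUND variant, with hypothetical names and buffer sizes (the offsets mirror the round_const values in the SSE2 code):

#include <stdint.h>

static void scalar_convolve_2d(const uint8_t *src, int src_stride,
                               int32_t *dst, int dst_stride, int w, int h,
                               const int16_t *x_filter,
                               const int16_t *y_filter, int taps,
                               int round_0, int round_1) {
  enum { kFilterBits = 7, kBd = 8, kMaxBlock = 128 };  // illustrative bounds
  int16_t im[(kMaxBlock + 16) * kMaxBlock];
  const int im_stride = kMaxBlock;
  const int im_h = h + taps - 1;
  const int fo = taps / 2 - 1;
  const uint8_t *src_ptr = src - fo * src_stride - fo;

  // Horizontal pass: the 1 << (bd + FILTER_BITS - 1) offset keeps the
  // intermediate non-negative so it fits the int16_t block.
  for (int i = 0; i < im_h; ++i)
    for (int j = 0; j < w; ++j) {
      int32_t sum = 1 << (kBd + kFilterBits - 1);
      for (int k = 0; k < taps; ++k)
        sum += x_filter[k] * src_ptr[i * src_stride + j + k];
      im[i * im_stride + j] =
          (int16_t)((sum + ((1 << round_0) >> 1)) >> round_0);
    }

  // Vertical pass: subtract the propagated offset, round by round_1, then
  // accumulate into dst (arithmetic shift assumed, matching _mm_sra_epi32).
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) {
      int32_t sum = -(1 << (kBd + 2 * kFilterBits - round_0 - 1));
      for (int k = 0; k < taps; ++k)
        sum += y_filter[k] * im[(i + k) * im_stride + j];
      dst[i * dst_stride + j] += (sum + ((1 << round_1) >> 1)) >> round_1;
    }
}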
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
new file mode 100644
index 000000000..a0e58716d
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "./av1_rtcd.h"
+
+#if CONFIG_CONVOLVE_ROUND
+static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+// 16 epi16 pixels
+static INLINE void pixel_clamp_avx2(__m256i *u, int bd) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(*u, max);
+ clamped = _mm256_andnot_si256(mask, *u);
+ mask = _mm256_and_si256(mask, max);
+ clamped = _mm256_or_si256(mask, clamped);
+
+ const __m256i zero = _mm256_setzero_si256();
+ mask = _mm256_cmpgt_epi16(clamped, zero);
+ *u = _mm256_and_si256(clamped, mask);
+}
+
+// 8 epi16 pixels
+static INLINE void pixel_clamp_sse2(__m128i *u, int bd) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(*u, max);
+ clamped = _mm_andnot_si128(mask, *u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+
+ const __m128i zero = _mm_setzero_si128();
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ *u = _mm_and_si128(clamped, mask);
+}
+
+// Work on multiple of 32 pixels
+static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst,
+ const __m256i *rnd, int shift,
+ int num) {
+ do {
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
+ __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
+
+ x0 = _mm256_add_epi32(x0, *rnd);
+ x1 = _mm256_add_epi32(x1, *rnd);
+ x2 = _mm256_add_epi32(x2, *rnd);
+ x3 = _mm256_add_epi32(x3, *rnd);
+
+ x0 = _mm256_srai_epi32(x0, shift);
+ x1 = _mm256_srai_epi32(x1, shift);
+ x2 = _mm256_srai_epi32(x2, shift);
+ x3 = _mm256_srai_epi32(x3, shift);
+
+ x0 = _mm256_packs_epi32(x0, x1);
+ x2 = _mm256_packs_epi32(x2, x3);
+
+ pixel_clamp_avx2(&x0, 8);
+ pixel_clamp_avx2(&x2, 8);
+
+ x0 = _mm256_packus_epi16(x0, x2);
+ x1 = _mm256_loadu_si256((const __m256i *)sindex);
+ x2 = _mm256_permutevar8x32_epi32(x0, x1);
+
+ _mm256_storeu_si256((__m256i *)dst, x2);
+ src += 32;
+ dst += 32;
+ num--;
+ } while (num > 0);
+}
+
+static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst,
+ const __m256i *rnd, int shift) {
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+
+ x0 = _mm256_add_epi32(x0, *rnd);
+ x1 = _mm256_add_epi32(x1, *rnd);
+
+ x0 = _mm256_srai_epi32(x0, shift);
+ x1 = _mm256_srai_epi32(x1, shift);
+
+ x0 = _mm256_packs_epi32(x0, x1);
+ pixel_clamp_avx2(&x0, 8);
+
+ const __m256i x2 = _mm256_packus_epi16(x0, x0);
+ x1 = _mm256_loadu_si256((const __m256i *)sindex);
+ x0 = _mm256_permutevar8x32_epi32(x2, x1);
+
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x0));
+}
+
+static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst,
+ const __m256i *rnd, int shift) {
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+ x0 = _mm256_add_epi32(x0, *rnd);
+ x0 = _mm256_srai_epi32(x0, shift);
+
+ x0 = _mm256_packs_epi32(x0, x0);
+ pixel_clamp_avx2(&x0, 8);
+
+ x0 = _mm256_packus_epi16(x0, x0);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)sindex);
+ x0 = _mm256_permutevar8x32_epi32(x0, x1);
+
+ _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(x0));
+}
+
+static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst,
+ const __m128i *rnd, int shift) {
+ __m128i x = _mm_loadu_si128((const __m128i *)src);
+ x = _mm_add_epi32(x, *rnd);
+ x = _mm_srai_epi32(x, shift);
+
+ x = _mm_packs_epi32(x, x);
+ pixel_clamp_sse2(&x, 8);
+
+ x = _mm_packus_epi16(x, x);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(x);
+}
+
+void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ int bits) {
+ const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+ const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
+
+ if (w > 64) { // width = 128
+ do {
+ cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 32) { // width = 64
+ do {
+ cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 16) { // width = 32
+ do {
+ cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 8) { // width = 16
+ do {
+ cal_rounding_16_avx2(src, dst, &rnd_num, bits);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 4) { // width = 8
+ do {
+ cal_rounding_8_avx2(src, dst, &rnd_num, bits);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 2) { // width = 4
+ do {
+ cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else { // width = 2
+ do {
+ dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits));
+ dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits));
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ }
+}
+
+#if CONFIG_HIGHBITDEPTH
+static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src,
+ uint16_t *dst,
+ const __m256i *rnd, int shift,
+ int num, int bd) {
+ do {
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+ __m256i x2 = _mm256_loadu_si256((const __m256i *)src + 2);
+ __m256i x3 = _mm256_loadu_si256((const __m256i *)src + 3);
+
+ x0 = _mm256_add_epi32(x0, *rnd);
+ x1 = _mm256_add_epi32(x1, *rnd);
+ x2 = _mm256_add_epi32(x2, *rnd);
+ x3 = _mm256_add_epi32(x3, *rnd);
+
+ x0 = _mm256_srai_epi32(x0, shift);
+ x1 = _mm256_srai_epi32(x1, shift);
+ x2 = _mm256_srai_epi32(x2, shift);
+ x3 = _mm256_srai_epi32(x3, shift);
+
+ x0 = _mm256_packs_epi32(x0, x1);
+ x2 = _mm256_packs_epi32(x2, x3);
+
+ pixel_clamp_avx2(&x0, bd);
+ pixel_clamp_avx2(&x2, bd);
+
+ x0 = _mm256_permute4x64_epi64(x0, 0xD8);
+ x2 = _mm256_permute4x64_epi64(x2, 0xD8);
+
+ _mm256_storeu_si256((__m256i *)dst, x0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), x2);
+ src += 32;
+ dst += 32;
+ num--;
+ } while (num > 0);
+}
+
+static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src,
+ uint16_t *dst,
+ const __m256i *rnd, int shift,
+ int bd) {
+ __m256i x0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x1 = _mm256_loadu_si256((const __m256i *)src + 1);
+
+ x0 = _mm256_add_epi32(x0, *rnd);
+ x1 = _mm256_add_epi32(x1, *rnd);
+
+ x0 = _mm256_srai_epi32(x0, shift);
+ x1 = _mm256_srai_epi32(x1, shift);
+
+ x0 = _mm256_packs_epi32(x0, x1);
+ pixel_clamp_avx2(&x0, bd);
+
+ x0 = _mm256_permute4x64_epi64(x0, 0xD8);
+ _mm256_storeu_si256((__m256i *)dst, x0);
+}
+
+static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst,
+ const __m256i *rnd, int shift,
+ int bd) {
+ __m256i x = _mm256_loadu_si256((const __m256i *)src);
+ x = _mm256_add_epi32(x, *rnd);
+ x = _mm256_srai_epi32(x, shift);
+
+ x = _mm256_packs_epi32(x, x);
+ pixel_clamp_avx2(&x, bd);
+
+ x = _mm256_permute4x64_epi64(x, 0xD8);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(x));
+}
+
+static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst,
+ const __m128i *rnd, int shift,
+ int bd) {
+ __m128i x = _mm_loadu_si128((const __m128i *)src);
+ x = _mm_add_epi32(x, *rnd);
+ x = _mm_srai_epi32(x, shift);
+
+ x = _mm_packs_epi32(x, x);
+ pixel_clamp_sse2(&x, bd);
+ _mm_storel_epi64((__m128i *)dst, x);
+}
+
+void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride,
+ uint8_t *dst8, int dst_stride, int w,
+ int h, int bits, int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const __m256i rnd_num = _mm256_set1_epi32((int32_t)(1 << (bits - 1)));
+ const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
+
+ if (w > 64) { // width = 128
+ do {
+ cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 4, bd);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 32) { // width = 64
+ do {
+ cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 2, bd);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 16) { // width = 32
+ do {
+ cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, 1, bd);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 8) { // width = 16
+ do {
+ cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 4) { // width = 8
+ do {
+ cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (w > 2) { // width = 4
+ do {
+ cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else { // width = 2
+ do {
+ dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd);
+ dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd);
+ src += src_stride;
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ }
+}
+#endif // CONFIG_HIGHBITDEPTH
+#endif // CONFIG_CONVOLVE_ROUND
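
Per pixel, the rounding kernels above all compute the same thing: add the rounding bias, shift right by bits, clamp to [0, 2^bd - 1], and store; the AVX2 paths just batch 8/16/32 pixels per iteration and restore lane order with a permute after packing. A scalar equivalent (hypothetical helper name):

#include <stdint.h>

static uint16_t round_and_clamp(int32_t v, int bits, int bd) {
  const int32_t rounded = (v + (1 << (bits - 1))) >> bits;  // ROUND_POWER_OF_TWO
  const int32_t max = (1 << bd) - 1;
  if (rounded < 0) return 0;                                // pixel_clamp low side
  if (rounded > max) return (uint16_t)max;                  // pixel_clamp high side
  return (uint16_t)rounded;
}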
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
new file mode 100644
index 000000000..ff4a0a0fe
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+#if CONFIG_COMPOUND_ROUND
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_max_epi16(_mm_min_epi16(res, maxval), _mm_setzero_si128());
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
+ _mm_storeu_si128(p + 1,
+ _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ }
+ }
+ }
+}
+#else
+void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
+ CONV_BUF_TYPE *dst, int dst_stride, int w,
+ int h, InterpFilterParams *filter_params_x,
+ InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 15-bit intermediate array.
+ assert(conv_params->round_0 >= 5);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ // Accumulate values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
+ _mm_storeu_si128(p + 1,
+ _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
+ }
+ }
+ }
+}
+#endif
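
One reading of the bound behind the assert(conv_params->round_0 >= 5) in the non-compound path above (not text from the patch): the horizontal stage adds 1 << (bd + FILTER_BITS - 1), so its pre-shift output lies in [0, 2^(bd + FILTER_BITS + 1)); after the >> round_0 shift the stored value needs bd + FILTER_BITS + 1 - round_0 bits, which for bd = 12 and FILTER_BITS = 7 is 20 - round_0, so round_0 >= 5 keeps it within the unsigned 15-bit budget of the int16_t im_block. As a compact check:

#include <assert.h>

static void check_im_block_bits(int bd, int round_0) {
  const int filter_bits = 7;  // FILTER_BITS in aom_dsp/aom_filter.h
  // Holds for bd <= 12 whenever round_0 >= 5.
  assert(bd + filter_bits + 1 - round_0 <= 15);
}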
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
index 37e2f61e7..35d637f72 100644
--- a/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -19,8 +19,9 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
uint16_t *pred, int p_col, int p_row,
int p_width, int p_height, int p_stride,
int subsampling_x, int subsampling_y, int bd,
- int comp_avg, int16_t alpha, int16_t beta,
- int16_t gamma, int16_t delta) {
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int comp_avg = conv_params->do_average;
#if HORSHEAR_REDUCE_PREC_BITS >= 5
__m128i tmp[15];
#else
diff --git a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
index c69614e42..0648b95b3 100644
--- a/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
+++ b/third_party/aom/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -364,8 +364,9 @@ static void iidtx16(__m256i *in) {
#endif
void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
+ int stride, const TxfmParam *txfm_param) {
__m256i in[16];
+ int tx_type = txfm_param->tx_type;
load_buffer_16x16(input, in);
switch (tx_type) {
diff --git a/third_party/aom/av1/common/x86/idct_intrin_sse2.c b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
index d6a598746..bf12a26d3 100644
--- a/third_party/aom/av1/common/x86/idct_intrin_sse2.c
+++ b/third_party/aom/av1/common/x86/idct_intrin_sse2.c
@@ -59,10 +59,11 @@ static INLINE void fliplr_16x8(__m128i *in /*in[16]*/) {
#endif
void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- int tx_type) {
+ const TxfmParam *txfm_param) {
__m128i in[2];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
+ int tx_type = txfm_param->tx_type;
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8);
@@ -150,10 +151,11 @@ void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
}
void av1_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- int tx_type) {
+ const TxfmParam *txfm_param) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ int tx_type = txfm_param->tx_type;
// load input data
in[0] = load_input_data(input);
@@ -251,10 +253,11 @@ static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
#endif // CONFIG_EXT_TX
void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
+ int stride, const TxfmParam *txfm_param) {
__m128i in[32];
__m128i *in0 = &in[0];
__m128i *in1 = &in[16];
+ int tx_type = txfm_param->tx_type;
load_buffer_8x16(input, in0);
input += 8;
@@ -388,8 +391,9 @@ static INLINE void flip_buffer_lr_8x8(__m128i *in) {
#endif // CONFIG_EXT_TX
void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
+ int stride, const TxfmParam *txfm_param) {
__m128i in[16];
+ int tx_type = txfm_param->tx_type;
in[0] = load_input_data(input + 0 * 8);
in[1] = load_input_data(input + 1 * 8);
@@ -553,8 +557,9 @@ static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
}
void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
+ int stride, const TxfmParam *txfm_param) {
__m128i in[16];
+ int tx_type = txfm_param->tx_type;
// Transpose 16x8 input into in[]
in[0] = load_input_data(input + 0 * 16);
@@ -713,8 +718,9 @@ static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
}
void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- int tx_type) {
+ const TxfmParam *txfm_param) {
__m128i in[8];
+ int tx_type = txfm_param->tx_type;
in[0] = load_input_data(input + 0 * 8);
in[1] = load_input_data(input + 1 * 8);
@@ -897,8 +903,9 @@ static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
}
void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
- int tx_type) {
+ const TxfmParam *txfm_param) {
__m128i in[8];
+ int tx_type = txfm_param->tx_type;
// Load rows, packed two per element of 'in'.
// We pack into the bottom half of 'in' so that the
@@ -1119,8 +1126,9 @@ static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
}
void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
+ int stride, const TxfmParam *txfm_param) {
__m128i intl[16], intr[16], inbl[16], inbr[16];
+ int tx_type = txfm_param->tx_type;
int i;
for (i = 0; i < 16; ++i) {
@@ -1272,8 +1280,9 @@ static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
}
void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
+ int stride, const TxfmParam *txfm_param) {
__m128i in0[16], in1[16], in2[16], in3[16];
+ int tx_type = txfm_param->tx_type;
int i;
for (i = 0; i < 16; ++i) {
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
index 260faa8c9..e2e4f51c3 100644
--- a/third_party/aom/av1/common/x86/selfguided_sse4.c
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -16,8 +16,8 @@ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
if (bit_depth > 8) {
__m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
__m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
- __m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8));
- __m128i shift_b = _mm_set_epi64x(0, bit_depth - 8);
+ __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
a = _mm_mullo_epi32(a, n);
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse2.c b/third_party/aom/av1/common/x86/warp_plane_sse2.c
index cdc4e8d0f..5a22d9abf 100644
--- a/third_party/aom/av1/common/x86/warp_plane_sse2.c
+++ b/third_party/aom/av1/common/x86/warp_plane_sse2.c
@@ -17,9 +17,10 @@
void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y, int comp_avg,
- int16_t alpha, int16_t beta, int16_t gamma,
- int16_t delta) {
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int comp_avg = conv_params->do_average;
__m128i tmp[15];
int i, j, k;
const int bd = 8;
diff --git a/third_party/aom/av1/common/x86/warp_plane_ssse3.c b/third_party/aom/av1/common/x86/warp_plane_ssse3.c
index 494410e99..f8e6f62ba 100644
--- a/third_party/aom/av1/common/x86/warp_plane_ssse3.c
+++ b/third_party/aom/av1/common/x86/warp_plane_ssse3.c
@@ -204,9 +204,10 @@ static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
int height, int stride, uint8_t *pred, int p_col,
int p_row, int p_width, int p_height, int p_stride,
- int subsampling_x, int subsampling_y, int comp_avg,
- int16_t alpha, int16_t beta, int16_t gamma,
- int16_t delta) {
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int comp_avg = conv_params->do_average;
__m128i tmp[15];
int i, j, k;
const int bd = 8;